From 8b4b8d54d1f0dfbe8a7d1034b6d72dc21c08a46a Mon Sep 17 00:00:00 2001 From: bzego Date: Wed, 11 Mar 2026 15:27:10 +0300 Subject: [PATCH] Initial commit --- .gitignore | 3 + __init__.py | 2 + agent.py | 1184 ++++++++++++++++++ api.py | 77 ++ bundle_export.py | 267 ++++ cli.py | 282 +++++ db.py | 296 +++++ dedup/simhash.py | 41 + extract/clean.py | 39 + extract/doc_type.py | 134 ++ extract/experience.py | 159 +++ extract/experience_timeline.py | 144 +++ extract/llm.py | 585 +++++++++ extract/parse.py | 659 ++++++++++ extract/pdf_extract.py | 211 ++++ extract/sections.py | 70 ++ extract/templates/__init__.py | 1 + extract/templates/generic.py | 46 + extract/templates/hh.py | 58 + extract/templates/hh_ru.py | 85 ++ extract/templates/linkedin.py | 57 + extract/templates/one_page.py | 46 + extract/templates/one_page_en.py | 11 + extract/templates/one_page_ru.py | 11 + extract/templates/pptx_export.py | 45 + extract/text_extract.py | 99 ++ importers/file_scan.py | 21 + importers/telegram_html.py | 66 + importers/telegram_json.py | 73 ++ normalize.py | 174 +++ pdf_merge.py | 45 + pipeline.py | 1990 ++++++++++++++++++++++++++++++ search.py | 393 ++++++ util.py | 33 + 34 files changed, 7407 insertions(+) create mode 100644 .gitignore create mode 100644 __init__.py create mode 100644 agent.py create mode 100644 api.py create mode 100644 bundle_export.py create mode 100644 cli.py create mode 100644 db.py create mode 100644 dedup/simhash.py create mode 100644 extract/clean.py create mode 100644 extract/doc_type.py create mode 100644 extract/experience.py create mode 100644 extract/experience_timeline.py create mode 100644 extract/llm.py create mode 100644 extract/parse.py create mode 100644 extract/pdf_extract.py create mode 100644 extract/sections.py create mode 100644 extract/templates/__init__.py create mode 100644 extract/templates/generic.py create mode 100644 extract/templates/hh.py create mode 100644 extract/templates/hh_ru.py create mode 100644 extract/templates/linkedin.py create mode 100644 extract/templates/one_page.py create mode 100644 extract/templates/one_page_en.py create mode 100644 extract/templates/one_page_ru.py create mode 100644 extract/templates/pptx_export.py create mode 100644 extract/text_extract.py create mode 100644 importers/file_scan.py create mode 100644 importers/telegram_html.py create mode 100644 importers/telegram_json.py create mode 100644 normalize.py create mode 100644 pdf_merge.py create mode 100644 pipeline.py create mode 100644 search.py create mode 100644 util.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..77ac754 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +.venv/ +__pycache__/ +*.pyc diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..be35134 --- /dev/null +++ b/__init__.py @@ -0,0 +1,2 @@ +__all__ = [] +__version__ = "1.0.0" diff --git a/agent.py b/agent.py new file mode 100644 index 0000000..4bf9420 --- /dev/null +++ b/agent.py @@ -0,0 +1,1184 @@ +from __future__ import annotations + +import json +import re +import sqlite3 +from dataclasses import dataclass, asdict +from typing import Any, Dict, List, Optional, Set, Tuple + +try: + import httpx # type: ignore +except Exception: # pragma: no cover + httpx = None # type: ignore + +from tg_resume_db.search import search_with_filters +from tg_resume_db.extract.parse import ( + extract_remote, + extract_english, + extract_location_best_effort, + extract_roles_skills, + extract_salary, +) +from tg_resume_db.extract.clean import normalize_text +from 
tg_resume_db.extract.llm import resolve_llm_runtime
+from tg_resume_db.normalize import normalize_skill, find_skills_in_text
+
+
+# --------- Search plan (LLM outputs THIS, not SQL) ----------
+
+@dataclass
+class SearchPlan:
+    query_text: str = ""  # full-text query (FTS)
+    skills_any: Optional[List[str]] = None  # at least one must match
+    skills_all: Optional[List[str]] = None  # all must match
+    roles_any: Optional[List[str]] = None
+    location: Optional[str] = None
+    remote: Optional[bool] = None
+    english_min: Optional[str] = None  # e.g. A1..C2
+    exp_years_min: Optional[float] = None
+    salary_min: Optional[int] = None
+    salary_max: Optional[int] = None
+    limit: int = 20
+    sort: str = "rank"  # rank | exp_desc | salary_desc
+
+    def __post_init__(self):
+        self.skills_any = self.skills_any or []
+        self.skills_all = self.skills_all or []
+        self.roles_any = self.roles_any or []
+
+
+_ALLOWED_PLAN_KEYS = {
+    "query_text",
+    "skills_any",
+    "skills_all",
+    "roles_any",
+    "location",
+    "remote",
+    "english_min",
+    "exp_years_min",
+    "salary_min",
+    "salary_max",
+    "limit",
+    "sort",
+}
+
+# --------- Text helpers ----------
+
+_EN_ORDER = {"A1": 1, "A2": 2, "B1": 3, "B2": 4, "C1": 5, "C2": 6}
+
+
+def _norm_token(s: str) -> str:
+    s = (s or "").strip().lower()
+    s = re.sub(r"\s+", " ", s)
+    return s
+
+
+def _uniq_keep_order(xs: List[str]) -> List[str]:
+    seen = set()
+    out: List[str] = []
+    for x in (xs or []):
+        x = _norm_token(str(x))
+        if not x or x in seen:
+            continue
+        seen.add(x)
+        out.append(x)
+    return out
+
+
+def _filter_skills_vs_location(skills: List[str], location: Optional[str]) -> List[str]:
+    if not skills:
+        return []
+    bad = set()
+    if location:
+        bad.add(_norm_token(location))
+    for w in [
+        "москва", "санкт-петербург", "спб", "питер", "екатеринбург", "минск", "алматы",
+        "remote", "удаленно", "удалённо", "удаленка", "удалёнка", "гибрид", "hybrid",
+        "офис", "office", "onsite", "on-site",
+    ]:
+        bad.add(w)
+    return [s for s in skills if _norm_token(s) not in bad]
+
+
+# ---- Name-list detection (so a "list of full names" query is not over-constrained by filters) ----
+_NAME_RE = re.compile(r"\b[А-ЯЁA-Z][а-яёa-z]+(?:[-\s]+[А-ЯЁA-Z][а-яёa-z]+)+\b")
+
+
+def _looks_like_name_list(user_prompt: str) -> bool:
+    """
+    Heuristic: if the query contains several lines with full names, treat it as a
+    direct lookup by name and do not filter hard on stack/experience.
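+
+    Example (illustrative inputs, hypothetical names):
+        "Иван Петров\nАнна Сидорова\nОлег Иванов"  -> True   (3+ full-name matches)
+        "senior golang developer, remote"           -> False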
+ """ + if not user_prompt: + return False + matches = _NAME_RE.findall(user_prompt) + if len(matches) >= 3: + return True + + # lines with at least one full name + lines = [ln.strip() for ln in user_prompt.splitlines() if ln.strip()] + name_lines = sum(1 for ln in lines if _NAME_RE.search(ln)) + return name_lines >= 2 and len(matches) >= 2 + + +# ---- Work mode: hybrid must NOT force remote=true ---- + +_HYBRID_RE = re.compile(r"\b(гибрид|hybrid)\b", re.I) +_REMOTE_RE = re.compile(r"\b(remote|удал(ен|ён|енно|ённо)?|удаленк|удалёнк|дистанц)\b", re.I) +_OFFICE_RE = re.compile(r"\b(офис|office|on[-\s]?site|onsite|в офисе|на месте)\b", re.I) + + +def _apply_work_mode_overrides(user_prompt: str, plan: SearchPlan) -> None: + """ + Принудительно правим plan.remote по тексту запроса: + - "гибрид" => remote = None (не фильтруем) + - "офис/onsite" => remote = False + - "remote/удаленно" => remote = True + """ + t = (user_prompt or "").lower() + + if _HYBRID_RE.search(t): + plan.remote = None + return + if _OFFICE_RE.search(t): + plan.remote = False + return + if _REMOTE_RE.search(t): + plan.remote = True + return + + +def _simplify_query_text(user_prompt: str, skills_any: List[str]) -> str: + """ + FTS-поиск может ухудшаться, если query_text перегружен. + Если в запросе явно стек (3+ технологий) — оставим краткий search intent. + """ + up = (user_prompt or "").strip() + if len(skills_any) >= 3: + # максимально безопасно и универсально + if re.search(r"\bbackend\b", up, re.I) or "бэкенд" in up.lower(): + return "backend developer" + return "developer" + return up + + +# --------- sanitize helpers ---------- + +def _as_list(v: Any) -> List[str]: + if v is None: + return [] + if isinstance(v, list): + return [str(x) for x in v if str(x).strip()] + s = str(v).strip() + if not s: + return [] + return [x.strip() for x in s.split(",") if x.strip()] + + +def _to_bool(v: Any) -> Optional[bool]: + if v is None: + return None + if isinstance(v, bool): + return v + s = str(v).strip().lower() + if s in ("true", "1", "yes", "y", "да", "д"): + return True + if s in ("false", "0", "no", "n", "нет", "н"): + return False + return None + + +def _to_int(v: Any) -> Optional[int]: + if v is None: + return None + try: + return int(float(v)) + except Exception: + return None + + +def _to_float(v: Any) -> Optional[float]: + if v is None: + return None + try: + return float(v) + except Exception: + return None + + +def _sanitize_plan_dict(obj: Any) -> Dict[str, Any]: + """ + Убираем лишние ключи (например, user_prompt) и приводим типы. + Лечит: SearchPlan.__init__() got an unexpected keyword argument ... 
+ """ + if not isinstance(obj, dict): + return {} + + clean: Dict[str, Any] = {} + for k, v in obj.items(): + if k not in _ALLOWED_PLAN_KEYS: + continue + clean[k] = v + + if "skills_any" in clean: + clean["skills_any"] = _as_list(clean["skills_any"]) + if "skills_all" in clean: + clean["skills_all"] = _as_list(clean["skills_all"]) + if "roles_any" in clean: + clean["roles_any"] = _as_list(clean["roles_any"]) + + if "remote" in clean: + clean["remote"] = _to_bool(clean["remote"]) + + if "salary_min" in clean: + clean["salary_min"] = _to_int(clean["salary_min"]) + if "salary_max" in clean: + clean["salary_max"] = _to_int(clean["salary_max"]) + + if "exp_years_min" in clean: + clean["exp_years_min"] = _to_float(clean["exp_years_min"]) + + if "limit" in clean: + lim = _to_int(clean["limit"]) + clean["limit"] = lim if lim is not None else 20 + + if "sort" in clean: + clean["sort"] = str(clean["sort"] or "").strip() + + if "location" in clean and clean["location"] is not None: + loc = str(clean["location"]).strip() + clean["location"] = loc if loc else None + + if "english_min" in clean and clean["english_min"] is not None: + eng = str(clean["english_min"]).strip().upper() + clean["english_min"] = eng if eng else None + + if "query_text" in clean and clean["query_text"] is not None: + clean["query_text"] = str(clean["query_text"]).strip() + + return clean + + +# --------- heuristic plan ---------- + +def _heuristic_plan(user_prompt: str) -> SearchPlan: + # Если запрос похож на список имён — ищем по тексту без лишних фильтров + if _looks_like_name_list(user_prompt): + return SearchPlan( + query_text=user_prompt.strip(), + skills_any=[], + skills_all=[], + roles_any=[], + location=None, + remote=None, + english_min=None, + exp_years_min=None, + salary_min=None, + salary_max=None, + limit=20, + sort="rank", + ) + + text = normalize_text(user_prompt) + + roles, skills = extract_roles_skills(text) + location = extract_location_best_effort(text) + remote = extract_remote(text) + english = extract_english(text) + sal_min, sal_max, sal_conf, _ = extract_salary(text) + + skills = _filter_skills_vs_location(skills, location) + roles = _uniq_keep_order(roles) + skills = _uniq_keep_order(skills) + + plan = SearchPlan( + query_text=_simplify_query_text(user_prompt, skills), + skills_any=skills[:12], + roles_any=(["backend"] if ("backend" in roles or "backend" in user_prompt.lower()) else roles[:6]), + location=location, + remote=remote, + english_min=english, + salary_min=sal_min if sal_conf and sal_conf >= 0.4 else None, + salary_max=sal_max if sal_conf and sal_conf >= 0.4 else None, + limit=20, + sort="rank", + ) + + _apply_work_mode_overrides(user_prompt, plan) + return plan + + +# --------- Optional LLM (OpenAI-compatible base_url) ---------- + +def _llm_enabled() -> bool: + if httpx is None: + return False + runtime = resolve_llm_runtime() + return bool(runtime.get("base_url")) and bool(runtime.get("model")) + + +def _llm_call_json(messages: List[Dict[str, str]]) -> Dict[str, Any]: + if httpx is None: + raise RuntimeError("httpx is not installed") + + runtime = resolve_llm_runtime() + base_url = runtime.get("base_url", "").rstrip("/") + model = runtime.get("model", "") + api_key = runtime.get("api_key", "") + if not base_url or not model: + raise RuntimeError("LLM runtime is not configured") + + payload = {"model": model, "messages": messages, "temperature": 0.2} + + headers = {"Content-Type": "application/json"} + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + + with 
httpx.Client(timeout=30.0) as client: + r = client.post(f"{base_url}/chat/completions", headers=headers, json=payload) + r.raise_for_status() + data = r.json() + + content = data["choices"][0]["message"]["content"] + m = re.search(r"\{.*\}", content, flags=re.S) + if not m: + raise ValueError("LLM did not return JSON") + return json.loads(m.group(0)) + + +def _llm_build_plan(user_prompt: str, draft: SearchPlan) -> SearchPlan: + schema_hint = { + "query_text": "string", + "skills_any": ["string"], + "skills_all": ["string"], + "roles_any": ["string"], + "location": "string|null", + "remote": "bool|null", + "english_min": "A1|A2|B1|B2|C1|C2|null", + "exp_years_min": "number|null", + "salary_min": "int|null", + "salary_max": "int|null", + "limit": "int", + "sort": "rank|exp_desc|salary_desc", + } + + msgs = [ + { + "role": "system", + "content": ( + "Ты превращаешь запрос рекрутера в JSON-фильтры поиска по базе резюме.\n" + "НЕЛЬЗЯ писать SQL. Верни ТОЛЬКО JSON объекта SearchPlan.\n" + f"Schema: {json.dumps(schema_hint, ensure_ascii=False)}\n" + "ВАЖНО:\n" + "- Никаких лишних ключей - только поля Schema.\n" + "- Не добавляй в skills города/локации.\n" + "- 'гибрид' НЕ означает remote=true (если видишь 'гибрид' - remote=null).\n" + "- Старайся делать поиск широким: skills_all используй ТОЛЬКО если явно попросили обязательные навыки.\n" + "- Если в запросе есть указание уровня английского (например B2+), заполни english_min.\n" + "- Если явно указан опыт 'N+' лет - поставь exp_years_min=N.\n" + ), + }, + { + "role": "user", + "content": ( + f"Запрос: {user_prompt}\n\n" + f"Черновик (эвристика): {json.dumps(asdict(draft), ensure_ascii=False)}" + ), + }, + ] + + obj_raw = _llm_call_json(msgs) + obj = _sanitize_plan_dict(obj_raw) + + plan = SearchPlan(**{**asdict(draft), **obj}) + + plan.skills_any = _uniq_keep_order(_filter_skills_vs_location(plan.skills_any, plan.location)) + plan.skills_all = _uniq_keep_order(_filter_skills_vs_location(plan.skills_all, plan.location)) + plan.roles_any = _uniq_keep_order(plan.roles_any) + + # мягко улучшим query_text + plan.query_text = _simplify_query_text(user_prompt, plan.skills_any) + + plan.limit = max(5, min(int(plan.limit or 20), 50)) + if plan.sort not in ("rank", "exp_desc", "salary_desc"): + plan.sort = "rank" + + # fallback: если LLM обнулил важные поля - вернём эвристику + if not plan.skills_any: + plan.skills_any = draft.skills_any + if not plan.skills_all: + plan.skills_all = draft.skills_all + if plan.english_min is None and draft.english_min is not None: + plan.english_min = draft.english_min + if plan.exp_years_min is None: + try: + req_exp = _extract_required_exp_years(user_prompt) + if req_exp is not None: + plan.exp_years_min = req_exp + except Exception: + pass + + _apply_work_mode_overrides(user_prompt, plan) + + return plan + + +# --------- post processing: dedupe + "real fit" filter ---------- + +_CORE = {"java", "kotlin", "python", "go", "golang"} +_BONUS = {"c++", "cpp"} + +_LANG_VARIANTS = { + "java": {"java"}, + "kotlin": {"kotlin"}, + "python": {"python"}, + "go": {"go", "golang"}, + "c++": {"c++", "cpp", "c plus plus"}, + "c#": {"c#", "csharp"}, +} + +_SKILL_EVIDENCE_ALIASES = { + "go": {"go", "golang"}, + "golang": {"go", "golang"}, + "kubernetes": {"kubernetes", "k8s"}, + "postgresql": {"postgresql", "postgres", "postgre sql", "postgre-sql", "psql"}, + "javascript": {"javascript", "java script", "js"}, + "typescript": {"typescript", "type script", "ts"}, + "nodejs": {"nodejs", "node js", "node.js", "node"}, + "grpc": {"grpc", "g 
rpc"}, + "graphql": {"graphql", "graph ql"}, + "ci/cd": {"ci/cd", "ci cd", "cicd"}, + "c++": {"c++", "cpp", "c plus plus"}, + "c#": {"c#", "csharp", "c sharp"}, + "dotnet": {"dotnet", ".net"}, + "aws": {"aws", "amazon web services"}, + "gcp": {"gcp", "google cloud", "google cloud platform"}, + "redis": {"redis"}, + "kafka": {"kafka"}, + "docker": {"docker"}, +} + +_GENERIC_SKIP_SKILLS = { + "backend", + "frontend", + "fullstack", + "developer", + "engineer", + "senior", + "middle", + "junior", + "lead", +} + +_DOMAIN_VARIANTS = { + "fintech": { + "fintech", + "финтех", + "bank", + "banking", + "бан", + "payment", + "payments", + "card", + "cards", + "sber", + "тбанк", + "tinkoff", + "visa", + "mastercard", + "trading", + "exchange", + "crypto", + "крипт", + "биржа", + }, + "ecommerce": { + "ecommerce", + "e-commerce", + "marketplace", + "retail", + "checkout", + "cart", + "онлайн магазин", + }, + "gamedev": {"gamedev", "game dev", "gaming", "unity", "unreal", "игр"}, + "healthcare": {"healthcare", "medtech", "hospital", "clinic", "мед", "health tech"}, +} + + +def _token_in_text(text: str, token: str) -> bool: + if not text or not token: + return False + pat = r"(? bool: + aliases = _LANG_VARIANTS.get(canon_lang, {canon_lang}) + for tok in aliases: + if _token_in_text(text, tok): + return True + return False + + +def _skill_aliases(skill: str) -> List[str]: + canon = normalize_skill(skill) or _norm_token(skill) + if not canon: + return [] + + aliases = set() + aliases.add(canon) + aliases.add(_norm_token(skill)) + aliases.update(_SKILL_EVIDENCE_ALIASES.get(canon, set())) + if canon in _LANG_VARIANTS: + aliases.update(_LANG_VARIANTS.get(canon, set())) + + out: List[str] = [] + for a in aliases: + t = _norm_token(a) + if not t: + continue + out.append(t) + return _uniq_keep_order(out) + + +def _extract_required_skills(user_prompt: str, plan: Optional[SearchPlan], req_langs: Set[str]) -> List[str]: + raw: List[str] = [] + if plan: + raw.extend(plan.skills_all or []) + raw.extend(plan.skills_any or []) + raw.extend(find_skills_in_text(user_prompt or "")) + raw.extend(list(req_langs or set())) + + out: List[str] = [] + seen = set() + for s in raw: + canon = normalize_skill(s) or _norm_token(s) + if not canon: + continue + canon = _norm_token(canon) + if canon in _GENERIC_SKIP_SKILLS: + continue + if canon in seen: + continue + seen.add(canon) + out.append(canon) + return out[:10] + + +def _query_stack_is_strict(user_prompt: str) -> bool: + t = (user_prompt or "").lower() + if any(w in t for w in ("обязательно", "строго", "must", "required", "mandatory", "без этого")): + return True + if "," in t and " или " not in t and " or " not in t: + return True + return False + + +def _extract_required_domains(user_prompt: str) -> List[str]: + t = (user_prompt or "").lower() + out: List[str] = [] + for canon, variants in _DOMAIN_VARIANTS.items(): + if any(v in t for v in variants): + out.append(canon) + return out + + +def _domain_hit(text: str, domain: str) -> bool: + variants = _DOMAIN_VARIANTS.get(domain, set()) + txt = (text or "").lower() + return any(v in txt for v in variants) + + +def _load_resume_contexts( + con: sqlite3.Connection, + items: List[Dict[str, Any]], +) -> Dict[str, Dict[str, str]]: + resume_ids = [] + seen = set() + for it in items or []: + rid = str(it.get("resume_id") or "").strip() + if not rid or rid in seen: + continue + seen.add(rid) + resume_ids.append(rid) + + if not resume_ids: + return {} + + ph = ",".join("?" 
for _ in resume_ids) + sql = ( + f"SELECT resume_id, clean_text, sections_json, extraction_json " + f"FROM resumes WHERE resume_id IN ({ph})" + ) + try: + rows = con.execute(sql, resume_ids).fetchall() + except Exception: + return {} + + out: Dict[str, Dict[str, str]] = {} + for r in rows: + rid = str(r["resume_id"]) + clean = str(r["clean_text"] or "") + + sections: Dict[str, Any] = {} + try: + raw = json.loads(r["sections_json"] or "{}") + if isinstance(raw, dict): + sections = raw + except Exception: + sections = {} + + extraction: Dict[str, Any] = {} + try: + raw = json.loads(r["extraction_json"] or "{}") + if isinstance(raw, dict): + extraction = raw + except Exception: + extraction = {} + + skills_text = str(sections.get("skills") or "") + body_parts: List[str] = [] + for key in ("about", "summary", "experience", "projects", "work"): + val = sections.get(key) + if val: + body_parts.append(str(val)) + + for p in extraction.get("positions") or []: + if not isinstance(p, dict): + continue + body_parts.append(str(p.get("title") or "")) + body_parts.append(str(p.get("company") or "")) + body_parts.append(str(p.get("description") or "")) + + body_text = "\n".join(body_parts).strip() + + # fallback for badly split templates + if len(body_text) < 80: + body_text = clean + if skills_text: + body_text = body_text.replace(skills_text, " ") + + out[rid] = { + "skills_text": skills_text.lower(), + "body_text": body_text.lower(), + "clean_text": clean.lower(), + } + + return out + + +def _normalize_lang_token(token: str) -> Optional[str]: + t = _norm_token(token) + if not t: + return None + for canon, aliases in _LANG_VARIANTS.items(): + if t == canon or t in aliases: + return canon + return None + + +def _extract_required_languages(user_prompt: str) -> List[str]: + t = (user_prompt or "").lower() + hits: List[str] = [] + for canon, aliases in _LANG_VARIANTS.items(): + if any(_token_in_text(t, alias) for alias in aliases): + if canon not in hits: + hits.append(canon) + return hits + + +def _dedupe_by_candidate_best_rank(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + best: Dict[str, Dict[str, Any]] = {} + for it in items or []: + cid = it.get("candidate_id") or "" + if not cid: + continue + if cid not in best: + best[cid] = it + continue + # rank: у тебя чем меньше (более отрицательный), тем "выше" + r_new = it.get("rank") + r_old = best[cid].get("rank") + try: + if r_old is None or (r_new is not None and float(r_new) < float(r_old)): + best[cid] = it + except Exception: + pass + return list(best.values()) + + +def _needs_postfilter(user_prompt: str) -> bool: + """ + Включаем строгий "вакансионный" фильтр, если запрос похож на вакансию: + - "опыт от N лет" или "5+" + - явный стек из языков + """ + if _looks_like_name_list(user_prompt): + return False + + t = (user_prompt or "").lower() + if re.search(r"(опыт|experience).{0,20}(\d+)\s*\+|\b(\d+)\s*\+\s*лет", t): + return True + skill_hits = len(find_skills_in_text(t)) + if skill_hits >= 2: + return True + if _extract_required_domains(user_prompt) and skill_hits >= 1: + return True + # stack words fallback + hits = 0 + for w in ("java", "kotlin", "python", "go", "golang", "c++", "cpp"): + if w in t: + hits += 1 + return hits >= 2 + + +_EXCLUDE_LOC_MARKERS = { + "россия", + "russia", + "rf", + "russian federation", + "moscow", + "москва", + "москв", + "spb", + "petersburg", + "петербург", + "санкт", + "мск", + "belarus", + "беларусь", + "белоруссия", + "iran", + "ирак", + "iraq", + "пакистан", + "pakistan", + "india", + "индия", + "африк", +} 
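+
+# Illustrative behaviour of _location_exclusion_requested() below
+# (hypothetical queries, not real data):
+#   "backend, except russia"  -> True   (marker "russia" + negation word "except")
+#   "moscow office"           -> False  (marker present, but no negation word)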
+
+
+def _location_exclusion_requested(user_prompt: str) -> bool:
+    t = (user_prompt or "").lower()
+    return any(k in t for k in _EXCLUDE_LOC_MARKERS) and ("кроме" in t or "except" in t or "не " in t)
+
+
+def _extract_required_exp_years(user_prompt: str) -> Optional[float]:
+    t = (user_prompt or "").lower()
+    m = re.search(r"(опыт|experience).{0,20}(\d+(?:[.,]\d+)?)\s*(?:лет|years?)", t)
+    if m:
+        try:
+            return float(m.group(2).replace(",", "."))
+        except Exception:
+            return None
+    m = re.search(r"\b(\d+(?:[.,]\d+)?)\s*\+\s*(?:лет|years?)\b", t)
+    if m:
+        try:
+            return float(m.group(1).replace(",", "."))
+        except Exception:
+            return None
+    return None
+
+
+def _extract_required_english(user_prompt: str) -> Optional[str]:
+    t = (user_prompt or "").upper()
+    m = re.search(r"\b(A1\+?|A2\+?|B1\+?|B2\+?|C1\+?|C2\+?)\b", t)
+    if m:
+        return m.group(1).replace("+", "")
+    # textual
+    if "FLUENT" in t or "ADVANCED" in t or "PROFICIENT" in t:
+        return "C1"
+    if "UPPER" in t and "INTERMEDIATE" in t:
+        return "B2"
+    if "INTERMEDIATE" in t:
+        return "B1"
+    return None
+
+
+def _jobfit_filter_items(
+    con: sqlite3.Connection,
+    user_prompt: str,
+    items: List[Dict[str, Any]],
+    plan: Optional[SearchPlan] = None,
+) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
+    """
+    Vacancy-grade filtering:
+    - exp >= required (when specified)
+    - the stack matches (at least 1 language from the query/Core)
+    - a mandatory primary Go (for queries that ask for Go)
+    - English >= the required level
+    - backend is not mandatory, but it is taken into account when sorting
+    """
+    req_exp = _extract_required_exp_years(user_prompt)  # e.g. 5.0
+    req_langs = set(_extract_required_languages(user_prompt))
+    req_english = _extract_required_english(user_prompt)
+    req_skills = _extract_required_skills(user_prompt, plan, req_langs)
+    req_domains = _extract_required_domains(user_prompt)
+    strict_stack = _query_stack_is_strict(user_prompt) or (req_exp is not None)
+
+    must_have_skills = _uniq_keep_order([normalize_skill(s) or s for s in ((plan.skills_all or []) if plan else [])])
+    if not must_have_skills and strict_stack and req_skills:
+        # For short vacancy stacks, treat every listed skill as mandatory.
+        if len(req_skills) <= 4:
+            must_have_skills = req_skills
+        else:
+            must_have_skills = req_skills[:4]
+
+    filtered: List[Dict[str, Any]] = []
+    dropped = 0
+    reasons: Dict[str, int] = {}
+
+    exclude_ru = _location_exclusion_requested(user_prompt)
+    # If the query explicitly asks for Go and gives no year count, default to a 4-year minimum.
+    if req_exp is None and ("go" in req_langs or "golang" in req_langs):
+        req_exp = 4.0
+
+    resume_ctx = _load_resume_contexts(con, items)
+
+    for it in items or []:
+        roles = set((it.get("roles") or []))
+        skills = set(_norm_token(s) for s in (it.get("skills") or []))
+        for pl in it.get("primary_languages") or []:
+            skills.add(_norm_token(pl))
+
+        # Check engineering experience first:
+        # if 'experience_years_eng' is available (not None), use it;
+        # otherwise fall back to 'experience_years'.
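+        # Worked example (hypothetical values): exp_eng=5.5, exp_total=8.0,
+        # req_exp=6.0 -> exp_val=5.5 -> dropped with reason "exp_lt_required".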
+        exp_eng = it.get("experience_years_eng")
+        exp_total = it.get("experience_years")
+
+        # Prefer engineering years for filtering if available
+        exp_val = None
+        if exp_eng is not None:
+            try:
+                exp_val = float(exp_eng)
+            except (TypeError, ValueError):
+                pass
+
+        if exp_val is None and exp_total is not None:
+            try:
+                exp_val = float(exp_total)
+            except (TypeError, ValueError):
+                pass
+
+        if req_exp is not None and (exp_val is None or exp_val < req_exp):
+            dropped += 1
+            reasons["exp_lt_required"] = reasons.get("exp_lt_required", 0) + 1
+            continue
+
+        backend_focus_flag = it.get("backend_focus")
+
+        loc = (it.get("location") or "").lower()
+        if exclude_ru and any(bad in loc for bad in _EXCLUDE_LOC_MARKERS):
+            dropped += 1
+            reasons["location_excluded"] = reasons.get("location_excluded", 0) + 1
+            continue
+
+        lang_tokens: Set[str] = set()
+        for lang in (it.get("primary_languages") or []):
+            norm = _normalize_lang_token(lang)
+            if norm:
+                lang_tokens.add(norm)
+        if not lang_tokens:
+            for sk in skills:
+                norm = _normalize_lang_token(sk)
+                if norm:
+                    lang_tokens.add(norm)
+
+        # For the language stack, keep only the basic check.
+        missing_primary_lang = False
+        for req_lang in req_langs:
+            if req_lang not in lang_tokens and req_lang in ("go", "python", "java", "kotlin", "c++", "c#"):
+                missing_primary_lang = True
+                break
+        if missing_primary_lang:
+            dropped += 1
+            reasons["no_primary_required_lang"] = reasons.get("no_primary_required_lang", 0) + 1
+            continue
+
+        rid = str(it.get("resume_id") or "")
+        ctx = resume_ctx.get(rid) or {}
+        ctx_body = str(ctx.get("body_text") or "")
+        ctx_skills = str(ctx.get("skills_text") or "")
+        ctx_clean = str(ctx.get("clean_text") or "")
+        ctx_domain = "\n".join([ctx_body, ctx_clean, str(it.get("snippet") or "").lower()])
+
+        # Evidence-based skill validation (not just Go):
+        # must-have skills must not appear only in the "skills" section.
+        skill_hits_total = 0
+        skill_hits_body = 0
+        missing_must = 0
+        skills_only_must = 0
+        skills_only_critical = 0
+        for req_skill in req_skills:
+            aliases = _skill_aliases(req_skill)
+            if not aliases:
+                continue
+            hit_body = any(_token_in_text(ctx_body, a) for a in aliases)
+            hit_skills = any(_token_in_text(ctx_skills, a) for a in aliases)
+            hit_any = hit_body or hit_skills or any(_norm_token(req_skill) == _norm_token(s) for s in skills)
+            if hit_any:
+                skill_hits_total += 1
+                if hit_body:
+                    skill_hits_body += 1
+
+            if req_skill in must_have_skills:
+                if not hit_any:
+                    missing_must += 1
+                elif not hit_body and hit_skills:
+                    skills_only_must += 1
+                    if _normalize_lang_token(req_skill) is not None:
+                        skills_only_critical += 1
+
+        if missing_must > 0:
+            dropped += 1
+            reasons["required_skill_missing"] = reasons.get("required_skill_missing", 0) + 1
+            continue
+
+        # Cut strictly when a key language requirement is found only in the skill list,
+        # or when the whole must-have stack is not confirmed by experience.
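+        # e.g. (hypothetical, under strict_stack): required "go" found only in the
+        # skills section, never in experience text -> skills_only_critical=1 -> dropped.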
+        if strict_stack and (skills_only_critical > 0 or (must_have_skills and skills_only_must >= len(must_have_skills))):
+            dropped += 1
+            reasons["required_skill_only_in_skills"] = reasons.get("required_skill_only_in_skills", 0) + 1
+            continue
+
+        if req_skills and strict_stack:
+            min_hits = len(must_have_skills) if must_have_skills else (2 if len(req_skills) >= 2 else 1)
+            if skill_hits_total < min_hits:
+                dropped += 1
+                reasons["required_skills_weak"] = reasons.get("required_skills_weak", 0) + 1
+                continue
+
+        domain_hits = 0
+        for d in req_domains:
+            if _domain_hit(ctx_domain, d):
+                domain_hits += 1
+        if req_domains and strict_stack and domain_hits < len(req_domains):
+            dropped += 1
+            reasons["domain_mismatch"] = reasons.get("domain_mismatch", 0) + 1
+            continue
+
+        if req_langs:
+            lang_hits_req = len(lang_tokens & req_langs)
+            if lang_hits_req < 1:
+                dropped += 1
+                reasons["lang_stack_weak"] = reasons.get("lang_stack_weak", 0) + 1
+                continue
+        else:
+            lang_hits_req = None
+
+        core_hits = len(lang_tokens & _CORE)
+        bonus_hits = len(lang_tokens & _BONUS)
+
+        # Require at least one language from CORE/bonus
+        if core_hits + bonus_hits < 1:
+            dropped += 1
+            reasons["stack_too_weak"] = reasons.get("stack_too_weak", 0) + 1
+            continue
+
+        it2 = dict(it)
+        it2["_fit"] = {
+            "core_hits": core_hits,
+            "bonus_cpp": bool(bonus_hits),
+            "req_lang_hits": lang_hits_req,
+            "req_skill_hits": skill_hits_total,
+            "req_skill_hits_body": skill_hits_body,
+            "req_domain_hits": domain_hits,
+            "backend_role": "backend" in roles,
+            "backend_focus": backend_focus_flag,
+        }
+        if req_english:
+            lvl = str(it.get("english_level") or "").upper()
+            if not lvl or _EN_ORDER.get(lvl, 0) < _EN_ORDER.get(req_english, 0):
+                dropped += 1
+                reasons["english_below_required"] = reasons.get("english_below_required", 0) + 1
+                continue
+
+        filtered.append(it2)
+
+    # sort: more core_hits first, then rank
+    def key(x: Dict[str, Any]):
+        fit = x.get("_fit") or {}
+        core_hits = int(fit.get("core_hits", 0))
+        bonus = 1 if fit.get("bonus_cpp") else 0
+        backend_bonus = 1 if fit.get("backend_role") or fit.get("backend_focus") else 0
+        req_skill_hits = int(fit.get("req_skill_hits", 0))
+        req_skill_hits_body = int(fit.get("req_skill_hits_body", 0))
+        req_domain_hits = int(fit.get("req_domain_hits", 0))
+        r = x.get("rank")
+        try:
+            r = float(r)
+        except Exception:
+            r = 0.0
+        # manual scoring on domain signals
+        score = 0.0
+        if "go" in (x.get("primary_languages") or []):
+            score += 5.0  # Go as the primary language
+        try:
+            if x.get("experience_years_eng") and float(x.get("experience_years_eng")) >= max(4.0, req_exp or 0):
+                score += 3.0
+        except Exception:
+            pass
+        skills = set(_norm_token(s) for s in (x.get("skills") or []))
+        text_boost = 0.0
+        for kw in ("kubernetes", "k8s"):
+            if kw in skills:
+                text_boost += 1.5
+                break
+        for kw in ("ddd", "domain-driven design", "eda", "event-driven"):
+            if kw in skills:
+                text_boost += 2.0
+                break
+        for kw in ("fintech", "trading", "crypto", "exchange", "биржа", "финтех"):
+            if kw in skills:
+                text_boost += 2.5
+                break
+        snippet = (x.get("snippet") or "").lower()
+        for kw in ("highload", "high-load", "high throughput", "high-throughput", "low latency", "low-latency"):
+            if kw in snippet:
+                text_boost += 1.5
+                break
+        score += text_boost
+        return (-req_domain_hits, -req_skill_hits_body, -req_skill_hits, -core_hits, -backend_bonus, -bonus, -score, r)
+
+    filtered.sort(key=key)
+
+    dbg = {
+        "postfilter_applied": True,
+        "required_exp": req_exp,
+        "required_languages": sorted(list(req_langs)),
+        "required_skills":
req_skills, + "must_have_skills": must_have_skills, + "required_domains": req_domains, + "strict_stack": strict_stack, + "dropped": dropped, + "reasons": reasons, + } + return filtered, dbg + + +# --------- Refinement loop ---------- + +def _refine_plan_no_llm(plan: SearchPlan, result_count: int, user_prompt: str) -> SearchPlan: + p = SearchPlan(**asdict(plan)) + + if result_count == 0: + p.location = None + p.salary_min = None + p.salary_max = None + p.english_min = None + + # если было строго по remote — ослабим; потом override применим обратно + p.remote = None + + # опыт уменьшаем плавно + if p.exp_years_min is not None: + p.exp_years_min = max(0.0, float(p.exp_years_min) - 1.0) + + if not (p.query_text or "").strip(): + p.query_text = " ".join(p.skills_any[:8]) + + _apply_work_mode_overrides(user_prompt, p) + return p + + return p + + +def agent_search( + con: sqlite3.Connection, + user_prompt: str, + max_iters: int = 2, + limit: int = 20, +) -> Dict[str, Any]: + draft = _heuristic_plan(user_prompt) + draft.limit = limit + + names_only_query = _looks_like_name_list(user_prompt) + plan = _llm_build_plan(user_prompt, draft) if (_llm_enabled() and not names_only_query) else draft + plan.limit = limit + + history: List[Dict[str, Any]] = [] + final_items: List[Dict[str, Any]] = [] + final_count = 0 + + for i in range(max_iters + 1): + _apply_work_mode_overrides(user_prompt, plan) + + res = search_with_filters(con, plan) + items = res.get("items", []) + count = int(res.get("count", len(items))) + + history.append( + { + "plan": asdict(plan), + "count": count, + "top_snippets": [it.get("snippet", "")[:180] for it in items[:5]], + } + ) + + if count > 0 or i == max_iters: + final_items = items + final_count = count + break + + # refine + if _llm_enabled(): + msgs = [ + { + "role": "system", + "content": ( + "Ты корректируешь JSON SearchPlan. Верни ТОЛЬКО JSON с полями SearchPlan.\n" + "Если 0 результатов — ослабь фильтры: remote=null, exp_years_min уменьшить/обнулить, " + "location/salary/english убрать. skills_any сохранить.\n" + "Никаких лишних ключей. 
Помни: 'гибрид' НЕ означает remote=true.\n" + ), + }, + { + "role": "user", + "content": json.dumps( + { + "query": user_prompt, + "previous_plan": asdict(plan), + "result_count": count, + }, + ensure_ascii=False, + ), + }, + ] + + obj_raw = _llm_call_json(msgs) + obj = _sanitize_plan_dict(obj_raw) + + plan = SearchPlan(**{**asdict(plan), **obj}) + + plan.skills_any = _uniq_keep_order(_filter_skills_vs_location(plan.skills_any, plan.location)) + plan.skills_all = _uniq_keep_order(_filter_skills_vs_location(plan.skills_all, plan.location)) + plan.roles_any = _uniq_keep_order(plan.roles_any) + + plan.query_text = _simplify_query_text(user_prompt, plan.skills_any) + plan.limit = limit + if plan.sort not in ("rank", "exp_desc", "salary_desc"): + plan.sort = "rank" + + _apply_work_mode_overrides(user_prompt, plan) + else: + plan = _refine_plan_no_llm(plan, count, user_prompt) + plan.limit = limit + + # ---- 1) dedupe ---- + deduped = _dedupe_by_candidate_best_rank(final_items) + + # ---- 2) postfilter for vacancy-like queries ---- + post_dbg: Dict[str, Any] = {"postfilter_applied": False} + if _needs_postfilter(user_prompt): + filtered, post_dbg = _jobfit_filter_items(con, user_prompt, deduped, plan=plan) + else: + filtered = deduped + + return { + "plan": asdict(plan), + "items": filtered, + "count": len(filtered), + "history": history, + "llm_used": _llm_enabled(), + "postfilter": post_dbg, + } diff --git a/api.py b/api.py new file mode 100644 index 0000000..7e74a77 --- /dev/null +++ b/api.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +import os +from typing import Any, Dict, Optional + +from fastapi import FastAPI +from pydantic import BaseModel, Field + +from tg_resume_db.db import connect, init_db +from tg_resume_db.agent import agent_search +from tg_resume_db.search import search as db_search + +DB_PATH = os.environ.get("CANDIDATES_DB", "./candidates.db") + +app = FastAPI(title="Resume Search API", version="1.0") + +class SearchRequest(BaseModel): + query: str = Field(default="") + limit: int = Field(default=20, ge=1, le=100) + offset: int = Field(default=0, ge=0) + remote: Optional[bool] = None + location: Optional[str] = None + experience_min: Optional[float] = None + salary_min: Optional[int] = None + salary_max: Optional[int] = None + english: Optional[str] = None + role: Optional[str] = None + skill: Optional[str] = None + + +class AISearchRequest(BaseModel): + prompt: str = Field(default="") + limit: int = Field(default=20, ge=1, le=100) + ai_iters: int = Field(default=2, ge=0, le=5) + + +@app.on_event("startup") +def _startup(): + con = connect(DB_PATH) + init_db(con) + con.close() + +@app.get("/health") +def health(): + return {"ok": True} + +@app.post("/search") +def search(req: SearchRequest) -> Dict[str, Any]: + con = connect(DB_PATH) + try: + items = db_search(con, query=req.query, filters=req.model_dump(), limit=req.limit, offset=req.offset) + return {"items": items, "count": len(items)} + finally: + con.close() + + +@app.post("/search/ai") +def search_ai(req: AISearchRequest) -> Dict[str, Any]: + con = connect(DB_PATH) + try: + res = agent_search( + con, + user_prompt=req.prompt, + max_iters=req.ai_iters, + limit=req.limit, + ) + return { + "ai": True, + "llm_used": res.get("llm_used", False), + "plan": res.get("plan"), + "history": res.get("history"), + "postfilter": res.get("postfilter"), + "items": res.get("items", []), + "count": int(res.get("count", 0)), + } + finally: + con.close() diff --git a/bundle_export.py b/bundle_export.py new file mode 100644 index 
0000000..b06b41d --- /dev/null +++ b/bundle_export.py @@ -0,0 +1,267 @@ +from __future__ import annotations + +import json +import os +import re +import shutil +import sqlite3 +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional + +# NEW: PDF merge helper (pypdf) +# pip install pypdf +try: + from tg_resume_db.pdf_merge import merge_all_pdfs_in_dir +except Exception: # чтобы не ломать bundle, если pypdf/модуль не поставили + merge_all_pdfs_in_dir = None + + +def _slug(s: str, max_len: int = 60) -> str: + s = (s or "").strip() + if not s: + return "candidate" + s = re.sub(r"\s+", " ", s) + s = re.sub(r"[^0-9A-Za-zА-Яа-я_\- ]+", "_", s) + s = s.replace(" ", "_") + s = re.sub(r"_+", "_", s).strip("_") + if not s: + return "candidate" + return s[:max_len] + + +def _safe_mkdir(p: Path) -> None: + p.mkdir(parents=True, exist_ok=True) + + +def _pick_source_paths(con: sqlite3.Connection, resume_id: str) -> List[str]: + """ + Возвращает список самых приоритетных путей к файлу резюме. + 1) resumes.file_path + 2) sources.original_file_path + 3) некоторые варианты путей из sources.extra_json + """ + paths: List[str] = [] + + row = con.execute( + "SELECT file_path FROM resumes WHERE resume_id=?", + (resume_id,), + ).fetchone() + if row and row["file_path"]: + paths.append(str(row["file_path"])) + + cur = con.execute( + """SELECT original_file_path, original_file_name, extra_json + FROM sources + WHERE resume_id=?""", + (resume_id,), + ) + for r in cur.fetchall(): + ofp = r["original_file_path"] + if ofp: + paths.append(str(ofp)) + + try: + extra = json.loads(r["extra_json"] or "{}") + if isinstance(extra, dict): + for k in ("file_path", "path", "local_path", "source_path"): + if extra.get(k): + paths.append(str(extra[k])) + except Exception: + pass + + # дедуп + seen = set() + out: List[str] = [] + for p in paths: + p2 = os.path.normpath(p) + if p2 in seen: + continue + seen.add(p2) + out.append(p2) + return out + + +def _copy_unique(src: Path, dst_dir: Path, base_name: str) -> Path: + ext = src.suffix.lower() if src.suffix else "" + candidate = f"{base_name}{ext}" + dst = dst_dir / candidate + + if dst.exists(): + i = 2 + while True: + dst = dst_dir / f"{base_name}({i}){ext}" + if not dst.exists(): + break + i += 1 + + shutil.copy2(src, dst) + return dst + + +def bundle_search_results( + con: sqlite3.Connection, + results: Iterable[Dict[str, Any]], + out_dir: str, + *, + copy_files: bool = True, + merge_text: bool = True, + merge_pdf: bool = True, # NEW +) -> Dict[str, Any]: + """ + results: iterable dictов где есть минимум: + - resume_id + - candidate_id + - name (желательно) + + Создаёт: + - files/: скопированные исходные файлы резюме + - merged_resumes.txt: склейка текста clean_text из БД (если merge_text) + - pdf/merged.pdf: склейка всех PDF из files/ (если merge_pdf и pypdf установлен) + - manifest.json + - README.txt + """ + out_root = Path(out_dir).resolve() + files_dir = out_root / "files" + _safe_mkdir(files_dir) + + manifest: List[Dict[str, Any]] = [] + copied = 0 + missing = 0 + + merged_parts: List[str] = [] + merged_txt_path = out_root / "merged_resumes.txt" + + for item in results: + resume_id = item.get("resume_id") + cand_id = item.get("candidate_id") + name = item.get("name") or "" + if not resume_id or not cand_id: + continue + + # merged TXT из БД + if merge_text: + row = con.execute( + "SELECT clean_text FROM resumes WHERE resume_id=?", + (resume_id,), + ).fetchone() + clean_text = (row["clean_text"] if row else "") or "" + header = f"===== {name or 
cand_id} | {cand_id} | {resume_id} =====" + merged_parts.append(header) + merged_parts.append(clean_text.strip()) + merged_parts.append("") + + if not copy_files: + continue + + src_paths = _pick_source_paths(con, resume_id) + + src_found: Optional[Path] = None + for sp in src_paths: + p = Path(sp) + if p.exists() and p.is_file(): + src_found = p + break + + if not src_found: + missing += 1 + manifest.append( + { + "candidate_id": cand_id, + "name": name, + "resume_id": resume_id, + "copied": False, + "reason": "source_file_not_found", + "tried_paths": src_paths, + } + ) + continue + + base = f"{_slug(name) or _slug(cand_id)}__{resume_id}" + try: + dst = _copy_unique(src_found, files_dir, base) + copied += 1 + manifest.append( + { + "candidate_id": cand_id, + "name": name, + "resume_id": resume_id, + "copied": True, + "source_path": str(src_found), + "dest_path": str(dst), + } + ) + except Exception as e: + missing += 1 + manifest.append( + { + "candidate_id": cand_id, + "name": name, + "resume_id": resume_id, + "copied": False, + "reason": f"copy_failed: {repr(e)}", + "source_path": str(src_found), + } + ) + + # merged TXT + if merge_text: + merged_txt_path.write_text("\n".join(merged_parts), encoding="utf-8", errors="ignore") + + # NEW: merged PDF from files/*.pdf + merged_pdf_path: Optional[Path] = None + pdf_info: Optional[Dict[str, Any]] = None + if merge_pdf and copy_files and merge_all_pdfs_in_dir is not None: + try: + merged_pdf_path = out_root / "pdf" / "merged.pdf" + _safe_mkdir(merged_pdf_path.parent) + pdf_info = merge_all_pdfs_in_dir(files_dir, merged_pdf_path) + except Exception as e: + pdf_info = {"error": f"merge_pdf_failed: {repr(e)}"} + + # manifest.json + (out_root / "manifest.json").write_text( + json.dumps( + { + "out_dir": str(out_root), + "copied_files": copied, + "missing_files": missing, + "merged_text": str(merged_txt_path) if merge_text else None, + "merged_pdf": str(merged_pdf_path) if merged_pdf_path else None, + "pdf_info": pdf_info, + "items": manifest, + }, + ensure_ascii=False, + indent=2, + ), + encoding="utf-8", + errors="ignore", + ) + + # README + readme_lines = [ + "Папка создана командой search.", + "- files/: скопированные исходные файлы резюме", + "- merged_resumes.txt: склейка текста clean_text из БД", + "- manifest.json: что откуда скопировалось / что не найдено", + ] + if merge_pdf: + if merge_all_pdfs_in_dir is None: + readme_lines.append("- pdf/merged.pdf: НЕ создан (нужен пакет pypdf и модуль pdf_merge.py)") + else: + readme_lines.append("- pdf/merged.pdf: склейка всех PDF из files/ (если PDF были)") + + (out_root / "README.txt").write_text( + "\n".join(readme_lines) + "\n", + encoding="utf-8", + errors="ignore", + ) + + return { + "out_dir": str(out_root), + "copied_files": copied, + "missing_files": missing, + "merged_text": str(merged_txt_path) if merge_text else None, + "merged_pdf": str(merged_pdf_path) if merged_pdf_path else None, + "manifest": str(out_root / "manifest.json"), + "pdf_info": pdf_info, + } diff --git a/cli.py b/cli.py new file mode 100644 index 0000000..b798673 --- /dev/null +++ b/cli.py @@ -0,0 +1,282 @@ +from __future__ import annotations + +import argparse +import json +import sys +from datetime import datetime +from typing import Any, Dict +from pathlib import Path + +import os + +from tg_resume_db.bundle_export import bundle_search_results +from tg_resume_db.db import connect, init_db +from tg_resume_db.pipeline import import_exports as run_import +from tg_resume_db.search import search as run_search +from 
tg_resume_db.util import Logger +from tg_resume_db.extract.text_extract import extract_text as extract_text_generic +from tg_resume_db.extract.pdf_extract import extract_pdf_best +from tg_resume_db.extract.clean import normalize_text +from tg_resume_db.extract.doc_type import detect_doc_type +from tg_resume_db.extract.sections import split_sections, sections_present +from tg_resume_db.extract.parse import extract_name_guess + + +def _print_json(obj: Dict[str, Any]) -> None: + s = json.dumps(obj, ensure_ascii=False, indent=2) + try: + print(s) + except UnicodeEncodeError: + # Fallback for cp1251/legacy consoles. + print(s.encode("ascii", "backslashreplace").decode("ascii")) + + +def _is_interactive() -> bool: + return sys.stdin.isatty() and sys.stdout.isatty() + + +def main() -> None: + ap = argparse.ArgumentParser(prog="tg_resume_db") + sub = ap.add_subparsers(dest="cmd", required=True) + + # ---------------- import_exports ---------------- + imp = sub.add_parser("import_exports", help="Import Telegram exports recursively (incremental)") + imp.add_argument("--input", required=True, help="Path to exports directory") + imp.add_argument("--db", required=True, help="SQLite db path") + imp.add_argument("--log", default="./import.log", help="Log file path") + imp.add_argument("--near-dist", type=int, default=6, help="Simhash max Hamming distance for near-duplicates") + imp.add_argument("--min-text-len", type=int, default=250, help="Skip very short texts") + imp.add_argument( + "--llm", + choices=["auto", "off", "force"], + default="auto", + help="LLM enrichment mode: auto (default), off to disable, force to always run when configured", + ) + imp.add_argument( + "--llm-review", + choices=["always", "auto", "off"], + default="always", + help="LLM review mode for parsed JSON: always (default), auto, off", + ) + imp.add_argument( + "--llm-review-rounds", + type=int, + default=1, + help="How many LLM review merge rounds to run per resume (1..3)", + ) + + # ---------------- search ---------------- + s = sub.add_parser("search", help="Search candidates") + s.add_argument("--db", required=True) + s.add_argument("--query", required=True) + s.add_argument("--limit", type=int, default=20) + s.add_argument("--offset", type=int, default=0) + s.add_argument("--remote", choices=["true", "false"], default=None) + s.add_argument("--location", default=None) + s.add_argument("--experience-min", type=float, default=None) + s.add_argument("--salary-min", type=int, default=None) + s.add_argument("--salary-max", type=int, default=None) + s.add_argument("--english", default=None) + s.add_argument("--doc-type", default=None) + + # AI mode + s.add_argument("--ai", action="store_true", help="Use LLM to build filters from text query and run search") + s.add_argument("--ai-iters", type=int, default=2, help="How many refine iterations for AI search") + + # Backward compatible single-value filters + s.add_argument("--role", default=None, help="Single role (backward compatible); prefer --roles-any") + s.add_argument("--skill", default=None, help="Single skill (backward compatible); prefer --skills-any/--skills-all") + + # Stack filters (comma-separated) + s.add_argument("--roles-any", default=None, help="Comma-separated roles; at least one must match") + s.add_argument("--skills-any", default=None, help="Comma-separated skills; at least one must match") + s.add_argument("--skills-all", default=None, help="Comma-separated skills; all must match") + + # Bundle export behavior + s.add_argument("--bundle", choices=["ask", "yes", 
"no"], default="ask", help="Bundle found resumes into a folder") + + # ---------------- inspect ---------------- + ins = sub.add_parser("inspect", help="Inspect a single resume file (doc_type/sections)") + ins.add_argument("--file", required=True, help="Path to resume file") + + args = ap.parse_args() + + # ========================= import_exports ========================= + if args.cmd == "import_exports": + con = connect(args.db) + try: + init_db(con) + log = Logger(args.log) + + prev_enabled = os.environ.get("LLM_PARSE_ENABLED") + prev_force = os.environ.get("LLM_PARSE_FORCE") + prev_review_mode = os.environ.get("LLM_PARSE_REVIEW_MODE") + prev_review_rounds = os.environ.get("LLM_PARSE_REVIEW_ROUNDS") + try: + if args.llm == "off": + os.environ["LLM_PARSE_ENABLED"] = "0" + os.environ["LLM_PARSE_REVIEW_MODE"] = "off" + elif args.llm == "force": + os.environ["LLM_PARSE_ENABLED"] = "1" + os.environ["LLM_PARSE_FORCE"] = "1" + os.environ["LLM_PARSE_REVIEW_MODE"] = "always" + else: + os.environ["LLM_PARSE_REVIEW_MODE"] = args.llm_review + + rounds = max(1, min(int(args.llm_review_rounds), 3)) + os.environ["LLM_PARSE_REVIEW_ROUNDS"] = str(rounds) + stats = run_import( + con=con, + input_dir=args.input, + log=log, + max_near_dist=args.near_dist, + min_text_len=args.min_text_len, + ) + finally: + if args.llm == "off": + if prev_enabled is None: + os.environ.pop("LLM_PARSE_ENABLED", None) + else: + os.environ["LLM_PARSE_ENABLED"] = prev_enabled + elif args.llm == "force": + if prev_enabled is None: + os.environ.pop("LLM_PARSE_ENABLED", None) + else: + os.environ["LLM_PARSE_ENABLED"] = prev_enabled + if prev_force is None: + os.environ.pop("LLM_PARSE_FORCE", None) + else: + os.environ["LLM_PARSE_FORCE"] = prev_force + if prev_review_mode is None: + os.environ.pop("LLM_PARSE_REVIEW_MODE", None) + else: + os.environ["LLM_PARSE_REVIEW_MODE"] = prev_review_mode + if prev_review_rounds is None: + os.environ.pop("LLM_PARSE_REVIEW_ROUNDS", None) + else: + os.environ["LLM_PARSE_REVIEW_ROUNDS"] = prev_review_rounds + finally: + con.close() + + _print_json(stats) + return + + # ============================= search ============================= + if args.cmd == "search": + con = connect(args.db) + init_db(con) # важно: гарантирует, что resumes_fts и триггеры существуют + + try: + items: list[Dict[str, Any]] = [] + out: Dict[str, Any] = {} + + if args.ai: + from tg_resume_db.agent import agent_search + + res = agent_search( + con, + user_prompt=args.query, + max_iters=args.ai_iters, + ) + + items = res.get("items", []) + out = { + "ai": True, + "llm_used": res.get("llm_used", False), + "plan": res.get("plan"), + "history": res.get("history"), + "postfilter": res.get("postfilter"), + "items": items, + "count": res.get("count", len(items)), + } + else: + filters = { + "remote": (args.remote == "true") if args.remote is not None else None, + "location": args.location, + "experience_min": args.experience_min, + "salary_min": args.salary_min, + "salary_max": args.salary_max, + "english": args.english, + "doc_type": args.doc_type, + # backward compat + "role": args.role, + "skill": args.skill, + # new + "roles_any": args.roles_any, + "skills_any": args.skills_any, + "skills_all": args.skills_all, + } + + items = run_search( + con, + query=args.query, + filters=filters, + limit=args.limit, + offset=args.offset, + ) + out = {"ai": False, "items": items, "count": len(items)} + + # 1) печатаем результаты + _print_json(out) + + # 2) bundle prompt/flag + if args.bundle == "yes": + do_bundle = True + elif args.bundle == 
"no": + do_bundle = False + else: # ask + do_bundle = False + if _is_interactive(): + ans = input("\nСобрать найденные резюме в папку? (Y/N): ").strip().lower() + do_bundle = ans in ("y", "yes", "да", "д") + + if do_bundle: + ts = datetime.now().strftime("%Y%m%d_%H%M%S") + out_dir = f"./bundle_{ts}" + info = bundle_search_results(con, items, out_dir, copy_files=True, merge_text=True) + print(f"\n[done] Готово: {info['out_dir']}") + print(f" files copied: {info['copied_files']}, missing: {info['missing_files']}") + print(f" merged: {info['merged_text']}") + print(f" manifest: {info['manifest']}") + + return + + finally: + con.close() + + # ============================= inspect ============================= + if args.cmd == "inspect": + fp = args.file + path = Path(fp) + extract_meta = {} + if path.suffix.lower() == ".pdf": + pdf_res = extract_pdf_best(path, timeout_sec=25) + raw_text = pdf_res.text + extract_meta = { + "method": pdf_res.method, + "quality_score": pdf_res.score, + "quality_flags": pdf_res.flags, + "pages": len(pdf_res.pages), + } + else: + raw_text = extract_text_generic(path) + extract_meta = {"method": "generic"} + + clean = normalize_text(raw_text or "") + dt = detect_doc_type(clean, file_ext=Path(fp).suffix.lower()) + secs = split_sections(clean, dt.doc_type) + out = { + "file": fp, + "doc_type": dt.doc_type, + "confidence": dt.confidence, + "signals": dt.signals, + "extract": extract_meta, + "sections_present": sections_present(secs), + "name_guess": extract_name_guess(clean), + } + _print_json(out) + return + + +if __name__ == "__main__": + main() diff --git a/db.py b/db.py new file mode 100644 index 0000000..1c7d3d9 --- /dev/null +++ b/db.py @@ -0,0 +1,296 @@ +from __future__ import annotations + +import sqlite3 +from pathlib import Path + + +SCHEMA = r""" +PRAGMA journal_mode=WAL; +PRAGMA synchronous=NORMAL; +PRAGMA temp_store=MEMORY; + +CREATE TABLE IF NOT EXISTS candidates ( + candidate_id TEXT PRIMARY KEY, + name TEXT, + location TEXT, + remote INTEGER, + experience_years REAL, + experience_years_eng REAL, -- инженерный опыт (после фильтра HR) + experience_confidence REAL, + salary_min INTEGER, + salary_max INTEGER, + salary_confidence REAL, + english_level TEXT, + roles_json TEXT, + skills_json TEXT, + primary_languages_json TEXT, + backend_focus INTEGER, + roles_norm TEXT, -- "|backend|devops|" + skills_norm TEXT, -- "|python|k8s|" + created_at TEXT DEFAULT (datetime('now')), + updated_at TEXT DEFAULT (datetime('now')) +); + +CREATE TABLE IF NOT EXISTS candidate_contacts ( + contact_type TEXT NOT NULL, -- email/phone/tg/github/linkedin/url + contact_value TEXT NOT NULL, -- normalized + candidate_id TEXT NOT NULL, + created_at TEXT DEFAULT (datetime('now')), + PRIMARY KEY(contact_type, contact_value), + FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id) +); + +CREATE INDEX IF NOT EXISTS idx_contacts_candidate ON candidate_contacts(candidate_id); + +CREATE TABLE IF NOT EXISTS resumes ( + resume_id TEXT PRIMARY KEY, + candidate_id TEXT NOT NULL, + sha256 TEXT, + simhash TEXT, + clean_text TEXT NOT NULL, + raw_text TEXT, + extraction_json TEXT, + llm_summary TEXT, + llm_tags_json TEXT, + extract_method TEXT, + extract_quality_score REAL, + extract_quality_flags TEXT, + extract_pages_json TEXT, + doc_type TEXT, + doc_type_confidence REAL, + parse_method TEXT, + parse_version TEXT, + sections_json TEXT, + is_active INTEGER DEFAULT 1, + duplicate_of_resume_id TEXT, + file_path TEXT, + file_mtime INTEGER, + file_size INTEGER, + created_at TEXT DEFAULT 
(datetime('now')), + FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id) +); + +CREATE UNIQUE INDEX IF NOT EXISTS idx_resumes_sha ON resumes(sha256) WHERE sha256 IS NOT NULL; +CREATE INDEX IF NOT EXISTS idx_resumes_candidate ON resumes(candidate_id); +CREATE INDEX IF NOT EXISTS idx_resumes_active ON resumes(is_active); + +CREATE TABLE IF NOT EXISTS sources ( + source_id INTEGER PRIMARY KEY AUTOINCREMENT, + resume_id TEXT NOT NULL, + export_path TEXT, + chat_title TEXT, + message_id TEXT, + message_date TEXT, + origin_type TEXT, + original_file_path TEXT, + original_file_name TEXT, + extra_json TEXT, + created_at TEXT DEFAULT (datetime('now')), + FOREIGN KEY(resume_id) REFERENCES resumes(resume_id) +); + +CREATE TABLE IF NOT EXISTS files_seen ( + sha256 TEXT PRIMARY KEY, + size INTEGER, + mtime INTEGER, + canonical_resume_id TEXT, + first_seen_at TEXT DEFAULT (datetime('now')), + last_seen_at TEXT DEFAULT (datetime('now')) +); + +CREATE TABLE IF NOT EXISTS simhash_buckets ( + bucket INTEGER NOT NULL, + band INTEGER NOT NULL, + resume_id TEXT NOT NULL, + PRIMARY KEY(bucket, band, resume_id), + FOREIGN KEY(resume_id) REFERENCES resumes(resume_id) +); + +CREATE TABLE IF NOT EXISTS candidate_skills ( + candidate_id TEXT NOT NULL, + skill_id TEXT NOT NULL, + skill_label TEXT, + confidence REAL, + source TEXT, + evidence TEXT, + created_at TEXT DEFAULT (datetime('now')), + PRIMARY KEY(candidate_id, skill_id), + FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id) +); + +CREATE TABLE IF NOT EXISTS candidate_roles ( + candidate_id TEXT NOT NULL, + role TEXT NOT NULL, + confidence REAL, + source TEXT, + evidence TEXT, + created_at TEXT DEFAULT (datetime('now')), + PRIMARY KEY(candidate_id, role), + FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id) +); + +CREATE TABLE IF NOT EXISTS candidate_languages ( + candidate_id TEXT NOT NULL, + language TEXT NOT NULL, + level TEXT, + confidence REAL, + source TEXT, + evidence TEXT, + created_at TEXT DEFAULT (datetime('now')), + PRIMARY KEY(candidate_id, language), + FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id) +); + +CREATE TABLE IF NOT EXISTS positions ( + position_id TEXT PRIMARY KEY, + resume_id TEXT NOT NULL, + candidate_id TEXT NOT NULL, + title TEXT, + company TEXT, + date_from TEXT, + date_to TEXT, + is_current INTEGER, + description TEXT, + stack_json TEXT, + created_at TEXT DEFAULT (datetime('now')), + FOREIGN KEY(resume_id) REFERENCES resumes(resume_id), + FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id) +); + +CREATE TABLE IF NOT EXISTS llm_cache ( + cache_key TEXT PRIMARY KEY, + model TEXT, + result_json TEXT, + created_at TEXT DEFAULT (datetime('now')) +); + +-- Full-text index (FTS5): contentless +CREATE VIRTUAL TABLE IF NOT EXISTS resumes_fts USING fts5( + resume_id UNINDEXED, + candidate_id UNINDEXED, + clean_text, + tokenize='unicode61 remove_diacritics 2' +); + +-- --- Triggers to keep FTS synced with resumes --- +-- Insert +CREATE TRIGGER IF NOT EXISTS trg_resumes_ai_fts +AFTER INSERT ON resumes +BEGIN + DELETE FROM resumes_fts WHERE resume_id = NEW.resume_id; + INSERT INTO resumes_fts(resume_id, candidate_id, clean_text) + SELECT NEW.resume_id, NEW.candidate_id, NEW.clean_text + WHERE NEW.is_active = 1; +END; + +-- Delete +CREATE TRIGGER IF NOT EXISTS trg_resumes_ad_fts +AFTER DELETE ON resumes +BEGIN + DELETE FROM resumes_fts WHERE resume_id = OLD.resume_id; +END; + +-- Update (text/active/candidate) +CREATE TRIGGER IF NOT EXISTS trg_resumes_au_fts +AFTER UPDATE OF clean_text, 
is_active, candidate_id ON resumes
+BEGIN
+    DELETE FROM resumes_fts WHERE resume_id = NEW.resume_id;
+    INSERT INTO resumes_fts(resume_id, candidate_id, clean_text)
+    SELECT NEW.resume_id, NEW.candidate_id, NEW.clean_text
+    WHERE NEW.is_active = 1;
+END;
+"""
+
+
+def connect(db_path: str) -> sqlite3.Connection:
+    Path(db_path).parent.mkdir(parents=True, exist_ok=True)
+    con = sqlite3.connect(db_path)
+    con.row_factory = sqlite3.Row
+    return con
+
+
+def _table_exists(con: sqlite3.Connection, name: str) -> bool:
+    row = con.execute(
+        "SELECT 1 FROM sqlite_master WHERE type IN ('table','view') AND name=? LIMIT 1",
+        (name,),
+    ).fetchone()
+    return row is not None
+
+
+def _column_exists(con: sqlite3.Connection, table: str, column: str) -> bool:
+    cur = con.execute(f"PRAGMA table_info({table})")
+    for r in cur.fetchall():
+        if r["name"] == column:
+            return True
+    return False
+
+
+def _add_column_if_missing(con: sqlite3.Connection, table: str, column: str, ddl_type: str) -> None:
+    if not _table_exists(con, table):
+        return
+    if _column_exists(con, table, column):
+        return
+    con.execute(f"ALTER TABLE {table} ADD COLUMN {column} {ddl_type}")
+
+
+def _ensure_fts_backfilled(con: sqlite3.Connection) -> None:
+    """
+    If resumes_fts is empty or out of sync, rebuild it from resumes.
+    This cures the situation where init_db created the FTS table but nothing
+    was ever loaded into it, so search always returned 0 results.
+    """
+    if not _table_exists(con, "resumes") or not _table_exists(con, "resumes_fts"):
+        return
+
+    try:
+        resumes_cnt = int(con.execute("SELECT COUNT(*) AS c FROM resumes WHERE is_active=1").fetchone()["c"])
+        fts_cnt = int(con.execute("SELECT COUNT(*) AS c FROM resumes_fts").fetchone()["c"])
+    except Exception:
+        return
+
+    if resumes_cnt <= 0:
+        return
+
+    # Any mismatch -> rebuild (cures both an empty index and duplicates)
+    if fts_cnt != resumes_cnt:
+        con.execute("DELETE FROM resumes_fts")
+        con.execute(
+            """
+            INSERT INTO resumes_fts(resume_id, candidate_id, clean_text)
+            SELECT resume_id, candidate_id, clean_text
+            FROM resumes
+            WHERE is_active=1
+            """
+        )
+        con.commit()
+
+
+def init_db(con: sqlite3.Connection) -> None:
+    con.executescript(SCHEMA)
+    # Lightweight migrations for existing DBs (safe to re-run)
+    _add_column_if_missing(con, "candidates", "experience_years_eng", "REAL")
+    _add_column_if_missing(con, "candidates", "primary_languages_json", "TEXT")
+    _add_column_if_missing(con, "candidates", "backend_focus", "INTEGER")
+    _add_column_if_missing(con, "resumes", "llm_summary", "TEXT")
+    _add_column_if_missing(con, "resumes", "llm_tags_json", "TEXT")
+    _add_column_if_missing(con, "resumes", "extract_method", "TEXT")
+    _add_column_if_missing(con, "resumes", "extract_quality_score", "REAL")
+    _add_column_if_missing(con, "resumes", "extract_quality_flags", "TEXT")
+    _add_column_if_missing(con, "resumes", "extract_pages_json", "TEXT")
+    _add_column_if_missing(con, "resumes", "doc_type", "TEXT")
+    _add_column_if_missing(con, "resumes", "doc_type_confidence", "REAL")
+    _add_column_if_missing(con, "resumes", "parse_method", "TEXT")
+    _add_column_if_missing(con, "resumes", "parse_version", "TEXT")
+    _add_column_if_missing(con, "resumes", "sections_json", "TEXT")
+    if not _table_exists(con, "llm_cache"):
+        con.execute(
+            """
+            CREATE TABLE IF NOT EXISTS llm_cache (
+                cache_key TEXT PRIMARY KEY,
+                model TEXT,
+                result_json TEXT,
+                created_at TEXT DEFAULT (datetime('now'))
+            )
+            """
+        )
+    con.commit()
+    _ensure_fts_backfilled(con)
diff --git a/dedup/simhash.py b/dedup/simhash.py
new file mode 100644
index 0000000..be8a643
--- /dev/null +++ b/dedup/simhash.py @@ -0,0 +1,41 @@ +from __future__ import annotations + +import hashlib +import re +from typing import List, Tuple + +def sha256_file_bytes_iter(f, chunk_size: int = 1024 * 1024) -> str: + h = hashlib.sha256() + for chunk in iter(lambda: f.read(chunk_size), b""): + h.update(chunk) + return h.hexdigest() + +def sha256_file(path) -> str: + with open(path, "rb") as f: + return sha256_file_bytes_iter(f) + +def sha1_str(s: str) -> str: + return hashlib.sha1(s.encode("utf-8", errors="ignore")).hexdigest() + +def simhash64(text: str) -> int: + tokens = re.findall(r"[a-zA-Z0-9_#+./-]{2,}", text.lower()) + if not tokens: + return 0 + v = [0] * 64 + for tok in tokens: + h = hashlib.md5(tok.encode("utf-8")).digest() + x = int.from_bytes(h[:8], "big", signed=False) + for i in range(64): + v[i] += 1 if ((x >> i) & 1) else -1 + out = 0 + for i in range(64): + if v[i] > 0: + out |= (1 << i) + return out + +def hamming64(a: int, b: int) -> int: + return (a ^ b).bit_count() + +def simhash_bands(x: int) -> List[Tuple[int, int]]: + # 4 bands x 16 bits + return [(((x >> (band * 16)) & 0xFFFF), band) for band in range(4)] diff --git a/extract/clean.py b/extract/clean.py new file mode 100644 index 0000000..110bb04 --- /dev/null +++ b/extract/clean.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +import re +from collections import Counter +import unicodedata + +RE_PAGE = re.compile(r"^\s*(page|стр(аница)?)\s*\d+\s*(/|\s+of\s+)\s*\d+\s*$", re.I) +RE_ONLY_PUNCT = re.compile(r"^[\W_]+$", re.U) +RE_MULTI_SPACE = re.compile(r"[ \t]+") +RE_MULTI_NL = re.compile(r"\n{3,}") + +_INVISIBLE_CHARS = ["\u00ad", "\u200b", "\u200c", "\u200d", "\ufeff"] +_BIDI_CTRL_RE = re.compile(r"[\u202a-\u202e\u2060\u2066-\u2069\ufffe\uffff]") + +def normalize_text(raw: str) -> str: + text = raw.replace("\r\n", "\n").replace("\r", "\n") + for ch in _INVISIBLE_CHARS: + text = text.replace(ch, "") + text = _BIDI_CTRL_RE.sub("", text) + # remove most control/format chars but keep line breaks and tabs + text = "".join( + ch for ch in text + if ch in ("\n", "\t") or not unicodedata.category(ch).startswith("C") + ) + text = "".join(ch for ch in text if ch == "\n" or (ch.isprintable() and ch != "\x0b")) + lines = [RE_MULTI_SPACE.sub(" ", ln).strip() for ln in text.split("\n")] + lines = [ln for ln in lines if ln and not RE_PAGE.match(ln) and not RE_ONLY_PUNCT.match(ln)] + counts = Counter(lines) + filtered = [] + for ln in lines: + if counts[ln] >= 4 and len(ln) <= 90: + continue + filtered.append(ln) + text = "\n".join(filtered) + text = RE_MULTI_NL.sub("\n\n", text).strip() + return text + +def to_fts_text(clean: str) -> str: + return re.sub(r"\s+", " ", clean).strip() diff --git a/extract/doc_type.py b/extract/doc_type.py new file mode 100644 index 0000000..7cf701d --- /dev/null +++ b/extract/doc_type.py @@ -0,0 +1,134 @@ +from __future__ import annotations + +import re +from dataclasses import dataclass +from typing import List, Optional + + +@dataclass +class DocTypeResult: + doc_type: str + confidence: float + signals: List[str] + + +_HH_PATTERNS = [ + (re.compile(r"\bhh\.ru\b", re.I), 2.0, "hh_domain"), + (re.compile(r"\bheadhunter\b", re.I), 2.0, "headhunter"), + (re.compile(r"\bрезюме\s+обновлено\b", re.I), 2.5, "hh_resume_updated"), + (re.compile(r"\bжелаемая\s+должность\b", re.I), 1.2, "hh_desired_role"), + (re.compile(r"\bключевые\s+навыки\b", re.I), 1.0, "hh_key_skills"), + (re.compile(r"\bопыт\s+работы\b", re.I), 0.8, "hh_experience"), +] + +_LI_PATTERNS = [ + (re.compile(r"\blinkedin\b", 
re.I), 2.5, "li_brand"), + (re.compile(r"\blinkedin\.com\b", re.I), 2.0, "li_domain"), + (re.compile(r"\bexperience\b", re.I), 0.9, "li_experience"), + (re.compile(r"\beducation\b", re.I), 0.9, "li_education"), + (re.compile(r"\bskills\b", re.I), 0.9, "li_skills"), + (re.compile(r"\babout\b", re.I), 0.6, "li_about"), +] + +_PPTX_PATTERNS = [ + (re.compile(r"\bslide\b", re.I), 1.0, "pptx_slide"), + (re.compile(r"\bpowerpoint\b", re.I), 1.3, "pptx_powerpoint"), + (re.compile(r"\bpptx\b", re.I), 1.3, "pptx_ext"), + (re.compile(r"\bpresentation\b", re.I), 0.8, "pptx_presentation"), +] + + +def _score_patterns(text: str, patterns) -> tuple[float, List[str]]: + score = 0.0 + signals: List[str] = [] + for rx, weight, name in patterns: + if rx.search(text): + score += weight + signals.append(name) + return score, signals + + +def _confidence_from_score(score: float) -> float: + if score >= 4.0: + return 0.92 + if score >= 3.0: + return 0.85 + if score >= 2.0: + return 0.75 + if score >= 1.2: + return 0.62 + if score > 0.0: + return 0.50 + return 0.30 + + +def detect_doc_type(clean_text: str, file_ext: Optional[str] = None) -> DocTypeResult: + lines = [ln.strip() for ln in (clean_text or "").splitlines() if ln.strip()] + head_lines = lines[:80] + head_text = "\n".join(head_lines) + head_lc = head_text.lower() + + signals: List[str] = [] + + hh_score, hh_signals = _score_patterns(head_text, _HH_PATTERNS) + li_score, li_signals = _score_patterns(head_text, _LI_PATTERNS) + pptx_score, pptx_signals = _score_patterns(head_text, _PPTX_PATTERNS) + if file_ext and file_ext.lower() in (".pptx", ".ppt"): + pptx_score += 2.0 + signals.append("pptx_ext") + + signals.extend(hh_signals + li_signals + pptx_signals) + + # One-page heuristic: short resumes with dense content + total_len = len(clean_text or "") + one_page_score = 0.0 + if len(lines) <= 70 and total_len <= 4500: + one_page_score = 2.2 + signals.append("one_page_short") + elif len(lines) <= 90 and total_len <= 6500: + one_page_score = 1.6 + signals.append("one_page_medium") + + # Scan heuristic: very low textual content + letters = sum(ch.isalpha() for ch in clean_text or "") + total = max(1, len(clean_text or "")) + letter_ratio = letters / total + scan_score = 0.0 + if total_len < 200 or letter_ratio < 0.12: + scan_score = 3.2 + signals.append("scan_low_text") + if file_ext and file_ext.lower() in (".pdf", ".png", ".jpg", ".jpeg", ".tiff"): + scan_score += 0.6 + signals.append("scan_file_ext") + + candidates = [ + ("hh_ru", hh_score), + ("linkedin_pdf", li_score), + ("pptx_export", pptx_score), + ("one_page", one_page_score), + ("scan_pdf", scan_score), + ] + doc_type, best_score = max(candidates, key=lambda x: x[1]) + + if best_score <= 0.0: + base = "generic_pdf" if (file_ext and file_ext.lower() == ".pdf") else "generic" + return DocTypeResult(doc_type=base, confidence=0.35, signals=signals) + + confidence = _confidence_from_score(best_score) + # If scan is detected strongly, prefer it + if doc_type == "scan_pdf" and confidence >= 0.8: + return DocTypeResult(doc_type="scan_pdf", confidence=confidence, signals=signals) + + # Split one-page into ru/en + if doc_type == "one_page": + if _looks_cyrillic(head_text): + return DocTypeResult(doc_type="one_page_ru", confidence=confidence, signals=signals) + return DocTypeResult(doc_type="one_page_en", confidence=confidence, signals=signals) + + return DocTypeResult(doc_type=doc_type, confidence=confidence, signals=signals) + + +def _looks_cyrillic(text: str) -> bool: + cyr = len(re.findall(r"[А-Яа-яЁё]", 
text))
+    lat = len(re.findall(r"[A-Za-z]", text))
+    return cyr > lat and cyr >= 10
diff --git a/extract/experience.py b/extract/experience.py
new file mode 100644
index 0000000..964db09
--- /dev/null
+++ b/extract/experience.py
@@ -0,0 +1,159 @@
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from datetime import date
+from typing import Dict, List, Optional, Tuple
+
+# Month maps (EN + RU)
+MONTHS = {
+    "jan": 1, "january": 1, "янв": 1, "январ": 1,
+    "feb": 2, "february": 2, "фев": 2, "феврал": 2,
+    "mar": 3, "march": 3, "мар": 3, "март": 3,
+    "apr": 4, "april": 4, "апр": 4, "апрел": 4,
+    "may": 5, "май": 5,
+    "jun": 6, "june": 6, "июн": 6, "июнь": 6,
+    "jul": 7, "july": 7, "июл": 7, "июль": 7,
+    "aug": 8, "august": 8, "авг": 8, "август": 8,
+    "sep": 9, "september": 9, "сен": 9, "сент": 9,
+    "oct": 10, "october": 10, "окт": 10, "октя": 10,
+    "nov": 11, "november": 11, "ноя": 11, "ноябр": 11,
+    "dec": 12, "december": 12, "дек": 12, "дека": 12,
+}
+
+PRESENT_RE = re.compile(r"\b(present|now|current|настоящее время|по н\.в\.|по настоящее)\b", re.I)
+
+# Direct "X years" patterns
+DIRECT_YEARS_RE = re.compile(r"(\d+(?:[.,]\d+)?)\s*(?:\+?\s*)?(?:years?|yrs?|лет|года|год)\b", re.I)
+
+# Dates like 03.2019, 2019, Jan 2020, янв 2020
+MMYYYY_RE = re.compile(r"\b(0?[1-9]|1[0-2])[./-](\d{4})\b")
+YYYY_RE = re.compile(r"\b(19\d{2}|20\d{2})\b")
+MON_YYYY_RE = re.compile(r"\b([A-Za-z]{3,9}|[А-Яа-я]{3,9})\.?\s*(\d{4})\b")
+
+# Range separators
+RANGE_RE = re.compile(r"(?P<a>.+?)\s*(?:—|–|-|to|по)\s*(?P<b>.+?)$", re.I)
+
+@dataclass
+class ExpResult:
+    years: Optional[float]
+    confidence: float
+    debug: Dict
+
+def _clamp_years(y: float) -> Optional[float]:
+    if 0.0 <= y <= 45.0:
+        return y
+    return None
+
+def _parse_mon(mon: str) -> Optional[int]:
+    m = mon.strip().lower()
+    m = re.sub(r"[^\wа-я]+", "", m, flags=re.I)
+    # allow prefixes: "январ", "феврал"
+    for k, v in MONTHS.items():
+        if m.startswith(k):
+            return v
+    return None
+
+def _as_ymd(y: int, m: int) -> date:
+    return date(y, m, 1)
+
+def _parse_one_date(s: str) -> Optional[date]:
+    s = s.strip()
+    if PRESENT_RE.search(s):
+        today = date.today()
+        return date(today.year, today.month, 1)
+
+    m1 = MMYYYY_RE.search(s)
+    if m1:
+        mm = int(m1.group(1))
+        yy = int(m1.group(2))
+        return _as_ymd(yy, mm)
+
+    m2 = MON_YYYY_RE.search(s)
+    if m2:
+        mon = _parse_mon(m2.group(1))
+        yy = int(m2.group(2))
+        if mon:
+            return _as_ymd(yy, mon)
+
+    m3 = YYYY_RE.search(s)
+    if m3:
+        yy = int(m3.group(1))
+        return _as_ymd(yy, 1)
+
+    return None
+
+def _merge_intervals(intervals: List[Tuple[date, date]]) -> List[Tuple[date, date]]:
+    if not intervals:
+        return []
+    intervals = sorted(intervals, key=lambda x: (x[0], x[1]))
+    merged = [intervals[0]]
+    for s, e in intervals[1:]:
+        ls, le = merged[-1]
+        if s <= le:
+            merged[-1] = (ls, max(le, e))
+        else:
+            merged.append((s, e))
+    return merged
+
+def _months_between(a: date, b: date) -> int:
+    # month-level difference (inclusive-ish): b >= a
+    return (b.year - a.year) * 12 + (b.month - a.month)
+
+def extract_experience(text: str) -> ExpResult:
+    debug: Dict = {"direct_matches": [], "ranges": [], "intervals": []}
+
+    # 1) Direct years
+    directs = []
+    for m in DIRECT_YEARS_RE.finditer(text):
+        try:
+            v = float(m.group(1).replace(",", "."))
+            if 0 <= v <= 45:
+                directs.append(v)
+                debug["direct_matches"].append({"match": m.group(0), "value": v})
+        except Exception:
+            pass
+    if directs:
+        years = _clamp_years(max(directs))
+        return ExpResult(years=years, confidence=0.90, debug=debug)
+
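+    # Worked example for step 1 (sample phrase invented for illustration):
+    # a summary such as "7 years of backend experience" / "опыт 7 лет"
+    # matches DIRECT_YEARS_RE above, so extract_experience() returns
+    # ExpResult(years=7.0, confidence=0.90, ...) without ever consulting
+    # the date-range logic below.
+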
+    # 2) Ranges in lines: try to detect "start - end"
+    intervals: List[Tuple[date, date]] = []
+    for line in text.splitlines():
+        ln = line.strip()
+        if len(ln) < 7:
+            continue
+        # require range separator
+        if not any(x in ln for x in ("—", "–", "-", " to ", " по ")):
+            continue
+        rr = RANGE_RE.match(ln)
+        if not rr:
+            continue
+        a = rr.group("a")
+        b = rr.group("b")
+        da = _parse_one_date(a)
+        db = _parse_one_date(b)
+        if da and db:
+            if db < da:
+                da, db = db, da
+            # cap extremely old
+            if da.year < 1990:
+                continue
+            intervals.append((da, db))
+            debug["ranges"].append({"line": ln, "start": da.isoformat(), "end": db.isoformat()})
+
+    intervals = _merge_intervals(intervals)
+    debug["intervals"] = [{"start": s.isoformat(), "end": e.isoformat()} for s, e in intervals]
+
+    if not intervals:
+        return ExpResult(years=None, confidence=0.0, debug=debug)
+
+    total_months = 0
+    for s, e in intervals:
+        total_months += max(0, _months_between(s, e))
+    years = round(total_months / 12.0, 2)
+    years = _clamp_years(years) if years is not None else None
+
+    # confidence depends on amount of evidence
+    conf = 0.70 if total_months >= 12 else 0.55
+    return ExpResult(years=years, confidence=conf, debug=debug)
diff --git a/extract/experience_timeline.py b/extract/experience_timeline.py
new file mode 100644
index 0000000..ca73ae5
--- /dev/null
+++ b/extract/experience_timeline.py
@@ -0,0 +1,144 @@
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass, asdict
+from datetime import date
+from typing import List, Optional
+
+MONTHS = {
+    "jan": 1, "january": 1, "янв": 1, "январ": 1,
+    "feb": 2, "february": 2, "фев": 2, "феврал": 2,
+    "mar": 3, "march": 3, "мар": 3, "март": 3,
+    "apr": 4, "april": 4, "апр": 4, "апрел": 4,
+    "may": 5, "май": 5,
+    "jun": 6, "june": 6, "июн": 6, "июнь": 6,
+    "jul": 7, "july": 7, "июл": 7, "июль": 7,
+    "aug": 8, "august": 8, "авг": 8, "август": 8,
+    "sep": 9, "september": 9, "сен": 9, "сент": 9,
+    "oct": 10, "october": 10, "окт": 10, "октя": 10,
+    "nov": 11, "november": 11, "ноя": 11, "ноябр": 11,
+    "dec": 12, "december": 12, "дек": 12, "дека": 12,
+}
+
+PRESENT_RE = re.compile(r"\b(present|now|current|настоящее время|по н\.в\.|по настоящее)\b", re.I)
+MMYYYY_RE = re.compile(r"\b(0?[1-9]|1[0-2])[./-](\d{4})\b")
+YYYY_RE = re.compile(r"\b(19\d{2}|20\d{2})\b")
+MON_YYYY_RE = re.compile(r"\b([A-Za-z]{3,9}|[А-Яа-я]{3,9})\.?\s*(\d{4})\b")
+RANGE_RE = re.compile(r"(?P<a>.+?)\s*(?:—|–|-|to|по)\s*(?P<b>.+?)$", re.I)
+YEAR_RANGE_ONLY_RE = re.compile(r"^\s*\d{4}\s*(?:—|–|-|to|по)\s*\d{4}\s*$", re.I)
+EDU_CONTEXT_RE = re.compile(
+    r"\b("
+    r"education|university|institute|college|academy|school|bachelor|master|degree|faculty|"
+    r"образование|университет|институт|академ|колледж|школа|бакалав|магистр|факультет"
+    r")\b",
+    re.I,
+)
+
+
+@dataclass
+class Position:
+    title: Optional[str]
+    company: Optional[str]
+    date_from: Optional[str]
+    date_to: Optional[str]
+    is_current: Optional[bool]
+    description: Optional[str]
+
+
+def _parse_mon(mon: str) -> Optional[int]:
+    m = mon.strip().lower()
+    m = re.sub(r"[^\wа-я]+", "", m, flags=re.I)
+    for k, v in MONTHS.items():
+        if m.startswith(k):
+            return v
+    return None
+
+
+def _as_ymd(y: int, m: int) -> date:
+    return date(y, m, 1)
+
+
+def _parse_one_date(s: str) -> Optional[date]:
+    s = s.strip()
+    if PRESENT_RE.search(s):
+        today = date.today()
+        return date(today.year, today.month, 1)
+    m1 = MMYYYY_RE.search(s)
+    if m1:
+        mm = int(m1.group(1))
+        yy = int(m1.group(2))
+        return _as_ymd(yy, mm)
+    m2 = MON_YYYY_RE.search(s)
+    if m2:
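+        # Month-name forms such as "Jan 2020" or "январь 2020" land here;
+        # _parse_mon matches by month-name prefix, so truncated Russian stems
+        # from the MONTHS map ("сент", "окт") resolve as well.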
+ mon = _parse_mon(m2.group(1)) + yy = int(m2.group(2)) + if mon: + return _as_ymd(yy, mon) + m3 = YYYY_RE.search(s) + if m3: + yy = int(m3.group(1)) + return _as_ymd(yy, 1) + return None + + +def extract_positions(text: str, max_items: int = 40) -> List[Position]: + lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()] + positions: List[Position] = [] + i = 0 + while i < len(lines) and len(positions) < max_items: + ln = lines[i] + if not any(x in ln for x in ("—", "–", "-", " to ", " по ")): + i += 1 + continue + rr = RANGE_RE.match(ln) + if not rr: + i += 1 + continue + ctx = " ".join(lines[max(0, i - 2): min(len(lines), i + 4)]) + if YEAR_RANGE_ONLY_RE.match(ln) and EDU_CONTEXT_RE.search(ctx): + i += 1 + continue + da = _parse_one_date(rr.group("a")) + db = _parse_one_date(rr.group("b")) + if not da or not db: + i += 1 + continue + if da.year < 1990: + i += 1 + continue + is_current = PRESENT_RE.search(rr.group("b")) is not None + title = None + company = None + desc_lines: List[str] = [] + if i + 1 < len(lines): + if EDU_CONTEXT_RE.search(lines[i + 1]): + i += 1 + continue + header = lines[i + 1] + parts = [p.strip() for p in re.split(r"[,|/]", header) if p.strip()] + if parts: + title = parts[0] + if len(parts) > 1: + company = parts[1] + j = i + 2 + while j < len(lines): + if any(x in lines[j] for x in ("—", "–", "-", " to ", " по ")) and RANGE_RE.match(lines[j]): + break + desc_lines.append(lines[j]) + j += 1 + positions.append( + Position( + title=title, + company=company, + date_from=da.isoformat(), + date_to=db.isoformat(), + is_current=is_current, + description="\n".join(desc_lines).strip() if desc_lines else None, + ) + ) + i = j + return positions + + +def positions_to_dicts(items: List[Position]) -> List[dict]: + return [asdict(p) for p in items] diff --git a/extract/llm.py b/extract/llm.py new file mode 100644 index 0000000..8a5a1bf --- /dev/null +++ b/extract/llm.py @@ -0,0 +1,585 @@ +from __future__ import annotations + +import hashlib +import json +import os +import re +import sqlite3 +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +try: + import httpx # type: ignore +except Exception: # pragma: no cover + httpx = None # type: ignore + + +def resolve_llm_runtime() -> Dict[str, str]: + """ + Resolve OpenAI-compatible runtime config. 
+ Supports both generic vars and Mistral aliases: + - generic: LLM_BASE_URL / LLM_MODEL / LLM_API_KEY + - mistral: MISTRAL_API_KEY / MISTRAL_MODEL / MISTRAL_BASE_URL + """ + provider = (os.environ.get("LLM_PROVIDER") or "").strip().lower() + base_url = (os.environ.get("LLM_BASE_URL") or "").strip() + model = (os.environ.get("LLM_MODEL") or "").strip() + api_key = (os.environ.get("LLM_API_KEY") or "").strip() + + mistral_key = (os.environ.get("MISTRAL_API_KEY") or "").strip() + mistral_model = (os.environ.get("MISTRAL_MODEL") or "").strip() + mistral_base = (os.environ.get("MISTRAL_BASE_URL") or "https://api.mistral.ai/v1").strip() + + if not api_key and mistral_key: + api_key = mistral_key + if not model and mistral_model: + model = mistral_model + if not base_url and (mistral_key or mistral_model or provider == "mistral" or os.environ.get("MISTRAL_BASE_URL")): + base_url = mistral_base + + if base_url: + base_url = base_url.rstrip("/") + + if not provider: + if "mistral.ai" in base_url or (model and model.lower().startswith("mistral")): + provider = "mistral" + else: + provider = "generic" + + return { + "provider": provider, + "base_url": base_url, + "model": model, + "api_key": api_key, + } + + +# ------------- Public API ------------- + +def llm_parse_enabled() -> bool: + """ + Enabled only if httpx is available and both base_url/model are resolved. + Opt-out via LLM_PARSE_ENABLED=0. + """ + if httpx is None: + return False + if os.environ.get("LLM_PARSE_ENABLED", "1").lower() in ("0", "false", "no"): + return False + runtime = resolve_llm_runtime() + return bool(runtime["base_url"]) and bool(runtime["model"]) + + +_PROMPT_VERSION = "v3_sections_doc_type" +_REVIEW_PROMPT_VERSION = "v1_review_merge" + + +@dataclass +class LLMExtraction: + roles: List[str] + skills: List[str] + primary_languages: List[str] + seniority: Optional[str] + backend_focus: Optional[bool] + experience_years_total: Optional[float] + experience_years_engineering: Optional[float] + english_level: Optional[str] + location: Optional[str] + remote_ok: Optional[bool] + salary_min_usd: Optional[int] + salary_max_usd: Optional[int] + salary_min_rub: Optional[int] + salary_max_rub: Optional[int] + highlights: List[str] + keywords: List[str] + + @staticmethod + def from_obj(obj: Dict[str, Any]) -> "LLMExtraction": + def _as_list(v: Any) -> List[str]: + if v is None: + return [] + if isinstance(v, list): + return [str(x).strip() for x in v if str(x).strip()] + s = str(v).strip() + return [s] if s else [] + + def _as_float(v: Any) -> Optional[float]: + try: + return float(v) + except Exception: + return None + + def _as_int(v: Any) -> Optional[int]: + try: + return int(float(v)) + except Exception: + return None + + def _as_bool(v: Any) -> Optional[bool]: + if isinstance(v, bool): + return v + if v is None: + return None + s = str(v).strip().lower() + if s in ("true", "1", "yes", "y"): + return True + if s in ("false", "0", "no", "n"): + return False + return None + + return LLMExtraction( + roles=_as_list(obj.get("roles")), + skills=_as_list(obj.get("skills")), + primary_languages=_as_list(obj.get("primary_languages")), + seniority=(str(obj.get("seniority")).strip().lower() or None) if obj.get("seniority") else None, + backend_focus=_as_bool(obj.get("backend_focus")), + experience_years_total=_as_float(obj.get("experience_years_total")), + experience_years_engineering=_as_float(obj.get("experience_years_engineering")), + english_level=(str(obj.get("english_level")).strip().upper() or None) if obj.get("english_level") else 
None, + location=(str(obj.get("location")).strip() or None) if obj.get("location") else None, + remote_ok=_as_bool(obj.get("remote_ok")), + salary_min_usd=_as_int(obj.get("salary_min_usd")), + salary_max_usd=_as_int(obj.get("salary_max_usd")), + salary_min_rub=_as_int(obj.get("salary_min_rub")), + salary_max_rub=_as_int(obj.get("salary_max_rub")), + highlights=_as_list(obj.get("highlights")), + keywords=_as_list(obj.get("keywords")), + ) + + +def llm_extract_profile( + clean_text: str, + *, + con: Optional[sqlite3.Connection] = None, + doc_type: Optional[str] = None, + sections: Optional[Dict[str, str]] = None, +) -> Tuple[Optional[LLMExtraction], Dict[str, Any]]: + """ + Returns (LLMExtraction | None, debug_info). + - Uses cache on disk/sqlite to keep throughput high. + - Silently degrades to None on any failure. + """ + runtime = resolve_llm_runtime() + dbg: Dict[str, Any] = { + "enabled": llm_parse_enabled(), + "provider": runtime.get("provider"), + "model": runtime.get("model"), + "from_cache": False, + "cache_backend": None, + "error": None, + "prompt_version": _PROMPT_VERSION, + } + if not llm_parse_enabled(): + return None, dbg + + text_hash = hashlib.sha1(clean_text.encode("utf-8", errors="ignore")).hexdigest() + cache_key = f"extract:{text_hash}:{runtime['model']}:{_PROMPT_VERSION}" + + payload = _build_payload( + clean_text, + doc_type=doc_type, + sections=sections, + prompt_version=_PROMPT_VERSION, + temperature=float(os.environ.get("LLM_PARSE_TEMPERATURE", 0.1)), + max_tokens=int(os.environ.get("LLM_PARSE_MAX_TOKENS", 700)), + system_prompt="You output ONLY JSON for structured resume extraction.", + prompt_template=_PROMPT_TEMPLATE, + ) + + data = _cached_llm_json_call( + con=con, + cache_key=cache_key, + model=runtime["model"], + payload=payload, + dbg=dbg, + ) + if data is None: + return None, dbg + return LLMExtraction.from_obj(data), dbg + + +def llm_review_profile( + clean_text: str, + *, + draft: Dict[str, Any], + con: Optional[sqlite3.Connection] = None, + doc_type: Optional[str] = None, + sections: Optional[Dict[str, str]] = None, +) -> Tuple[Optional[LLMExtraction], Dict[str, Any]]: + """ + Second-pass validator: + - Takes already parsed JSON (draft) + - Re-checks every field against resume text + - Returns corrected extraction for safe merge in pipeline + """ + runtime = resolve_llm_runtime() + dbg: Dict[str, Any] = { + "enabled": llm_parse_enabled(), + "provider": runtime.get("provider"), + "model": runtime.get("model"), + "from_cache": False, + "cache_backend": None, + "error": None, + "prompt_version": _REVIEW_PROMPT_VERSION, + "quality_score": None, + "changed_fields": [], + "issues_found": [], + } + if not llm_parse_enabled(): + return None, dbg + + clean_draft = _sanitize_review_draft(draft) + draft_blob = json.dumps(clean_draft, ensure_ascii=False, sort_keys=True) + text_hash = hashlib.sha1(clean_text.encode("utf-8", errors="ignore")).hexdigest() + draft_hash = hashlib.sha1(draft_blob.encode("utf-8", errors="ignore")).hexdigest() + cache_key = f"review:{text_hash}:{draft_hash}:{runtime['model']}:{_REVIEW_PROMPT_VERSION}" + + payload = _build_payload( + clean_text, + doc_type=doc_type, + sections=sections, + prompt_version=_REVIEW_PROMPT_VERSION, + temperature=float(os.environ.get("LLM_REVIEW_TEMPERATURE", 0.0)), + max_tokens=int(os.environ.get("LLM_REVIEW_MAX_TOKENS", 850)), + system_prompt="You output ONLY JSON for resume parsing quality review.", + prompt_template=_REVIEW_PROMPT_TEMPLATE, + extra_vars={"draft_json": draft_blob}, + ) + + data = 
_cached_llm_json_call( + con=con, + cache_key=cache_key, + model=runtime["model"], + payload=payload, + dbg=dbg, + ) + if data is None: + return None, dbg + + corrected_obj: Dict[str, Any] + if isinstance(data.get("corrected"), dict): + corrected_obj = data["corrected"] + else: + corrected_obj = data + + dbg["quality_score"] = _as_float(data.get("quality_score")) + dbg["changed_fields"] = _as_str_list(data.get("changed_fields")) + dbg["issues_found"] = _as_str_list(data.get("issues_found")) + + return LLMExtraction.from_obj(corrected_obj), dbg + + +# ------------- Internal helpers ------------- + +_PROMPT_TEMPLATE = """ +Ты - ассистент, который структурирует резюме разработчиков. Отвечай ТОЛЬКО JSON. +Используй только факты из текста, ничего не придумывай. Если данных нет - ставь null или пустой список. +Схема: +{{ + "roles": ["backend","devops","frontend","qa","data engineer","android","ios"], + "skills": ["python","go","k8s","postgres","react", "..."], + "primary_languages": ["python","go","java","c++", "..."], + "seniority": "intern|junior|middle|senior|lead|principal|null", + "backend_focus": true|false|null, + "experience_years_total": number|null, + "experience_years_engineering": number|null, + "english_level": "A1|A2|B1|B2|C1|C2|null", + "location": "city, country|null", + "remote_ok": true|false|null, + "salary_min_usd": int|null, + "salary_max_usd": int|null, + "salary_min_rub": int|null, + "salary_max_rub": int|null, + "highlights": ["кратко достижения (1-2 предложения)"], + "keywords": ["уникальные ключевые слова, продукты или домены"] +}} +Не включай контактные данные в skills/keywords. +Detected doc_type: {doc_type} +Sections (if present): +{sections_block} + +Full text snippet (use only if needed): +```TEXT +{resume_text} +``` +""" + +_REVIEW_PROMPT_TEMPLATE = """ +Ты валидатор качества парсинга резюме разработчиков. Отвечай ТОЛЬКО JSON. +У тебя есть черновой JSON после эвристик/первичного парсинга. Нужно перепроверить каждое поле по тексту резюме. +Исправляй только то, что прямо подтверждается текстом. Нельзя выдумывать. + +Верни JSON строго такой формы: +{{ + "corrected": {{ + "roles": ["..."], + "skills": ["..."], + "primary_languages": ["..."], + "seniority": "intern|junior|middle|senior|lead|principal|null", + "backend_focus": true|false|null, + "experience_years_total": number|null, + "experience_years_engineering": number|null, + "english_level": "A1|A2|B1|B2|C1|C2|null", + "location": "city, country|null", + "remote_ok": true|false|null, + "salary_min_usd": int|null, + "salary_max_usd": int|null, + "salary_min_rub": int|null, + "salary_max_rub": int|null, + "highlights": ["..."], + "keywords": ["..."] + }}, + "changed_fields": ["field_name", "..."], + "issues_found": ["кратко что было неверно/сомнительно", "..."], + "quality_score": 0.0 +}} + +Черновик JSON: +```DRAFT +{draft_json} +``` + +Detected doc_type: {doc_type} +Sections (if present): +{sections_block} + +Full text snippet (use only if needed): +```TEXT +{resume_text} +``` +""" + + +def _trim_text(text: str, max_len: int = 9000) -> str: + """ + Keep head and tail to preserve summary + recent projects. 
+ """ + if len(text) <= max_len: + return text + head = text[: max_len // 2] + tail = text[-max_len // 2 :] + return head + "\n...\n" + tail + + +def _build_payload( + clean_text: str, + *, + doc_type: Optional[str], + sections: Optional[Dict[str, str]], + prompt_version: str, + temperature: float, + max_tokens: int, + system_prompt: str, + prompt_template: str, + extra_vars: Optional[Dict[str, Any]] = None, +) -> Dict[str, Any]: + runtime = resolve_llm_runtime() + base_url = runtime["base_url"] + model = runtime["model"] + + sections_block = _build_sections_block(sections) + tpl_vars = { + "resume_text": _trim_text(clean_text), + "doc_type": (doc_type or "unknown"), + "sections_block": sections_block or "(no sections detected)", + } + if extra_vars: + tpl_vars.update(extra_vars) + + prompt = prompt_template.format(**tpl_vars) + + return { + "base_url": base_url, + "model": model, + "prompt_version": prompt_version, + "payload": { + "model": model, + "messages": [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": prompt}, + ], + "temperature": temperature, + "max_tokens": max_tokens, + }, + "headers": _build_headers(runtime), + "timeout": float(os.environ.get("LLM_PARSE_TIMEOUT", 18.0)), + } + + +def _build_headers(runtime: Dict[str, str]) -> Dict[str, str]: + headers = {"Content-Type": "application/json"} + api_key = runtime.get("api_key", "") + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + return headers + + +def _cached_llm_json_call( + *, + con: Optional[sqlite3.Connection], + cache_key: str, + model: str, + payload: Dict[str, Any], + dbg: Dict[str, Any], +) -> Optional[Dict[str, Any]]: + data = _cache_get_sqlite(con, cache_key) + if data: + dbg["from_cache"] = True + dbg["cache_backend"] = "sqlite" + return data + + cache_dir = Path(os.environ.get("LLM_PARSE_CACHE", ".cache/llm_parse")).resolve() + cache_ok = True + try: + cache_dir.mkdir(parents=True, exist_ok=True) + except Exception: + cache_ok = False + + safe_name = cache_key.replace(":", "_") + cache_path = (cache_dir / f"{safe_name}.json") if cache_ok else None + + if cache_path and cache_path.exists(): + try: + data = json.loads(cache_path.read_text(encoding="utf-8")) + dbg["from_cache"] = True + dbg["cache_backend"] = "disk" + return data + except Exception: + pass + + try: + data = _llm_call_json(payload) + if con: + _cache_put_sqlite(con, cache_key, model, data) + if cache_path: + cache_path.write_text(json.dumps(data, ensure_ascii=False), encoding="utf-8") + return data + except Exception as e: # pragma: no cover - network/LLM failures + dbg["error"] = repr(e) + return None + + +def _llm_call_json(task: Dict[str, Any]) -> Dict[str, Any]: + if httpx is None: + raise RuntimeError("httpx is not installed") + + base_url: str = task["base_url"] + payload: Dict[str, Any] = task["payload"] + timeout = float(task.get("timeout", 18.0)) + + with httpx.Client(timeout=timeout) as client: + r = client.post(f"{base_url}/chat/completions", headers=task["headers"], json=payload) + r.raise_for_status() + data = r.json() + + content = data["choices"][0]["message"]["content"] + if isinstance(content, list): + parts = [] + for block in content: + if isinstance(block, dict): + parts.append(str(block.get("text") or "")) + else: + parts.append(str(block)) + content = "\n".join(parts) + content = str(content) + + m = re.search(r"\{.*\}", content, flags=re.S) + if not m: + raise ValueError("LLM did not return JSON") + return json.loads(m.group(0)) + + +def _build_sections_block(sections: 
Optional[Dict[str, str]]) -> str:
+    if not sections:
+        return ""
+    parts: List[str] = []
+    order = [
+        ("about", "ABOUT"),
+        ("skills", "SKILLS"),
+        ("experience", "EXPERIENCE"),
+        ("education", "EDUCATION"),
+        ("contacts", "CONTACTS"),
+    ]
+    for key, label in order:
+        text = sections.get(key)
+        if not text:
+            continue
+        snippet = _trim_text(text, max_len=1800)
+        parts.append(f"[{label}]\n{snippet}")
+    return "\n\n".join(parts)
+
+
+def _sanitize_review_draft(draft: Dict[str, Any]) -> Dict[str, Any]:
+    if not isinstance(draft, dict):
+        draft = {}
+
+    allowed = {
+        "roles",
+        "skills",
+        "primary_languages",
+        "seniority",
+        "backend_focus",
+        "experience_years_total",
+        "experience_years_engineering",
+        "english_level",
+        "location",
+        "remote_ok",
+        "salary_min_usd",
+        "salary_max_usd",
+        "salary_min_rub",
+        "salary_max_rub",
+        "highlights",
+        "keywords",
+    }
+    cleaned = {k: v for k, v in draft.items() if k in allowed}
+    return asdict(LLMExtraction.from_obj(cleaned))
+
+
+def _as_float(v: Any) -> Optional[float]:
+    try:
+        x = float(v)
+    except Exception:
+        return None
+    if x < 0:
+        return None
+    if x > 1.0:
+        return 1.0
+    return x
+
+
+def _as_str_list(v: Any) -> List[str]:
+    if v is None:
+        return []
+    if isinstance(v, list):
+        return [str(x).strip() for x in v if str(x).strip()]
+    s = str(v).strip()
+    return [s] if s else []
+
+
+def _cache_get_sqlite(con: Optional[sqlite3.Connection], cache_key: str) -> Optional[Dict[str, Any]]:
+    if con is None:
+        return None
+    try:
+        row = con.execute("SELECT result_json FROM llm_cache WHERE cache_key=?", (cache_key,)).fetchone()
+        if row and row["result_json"]:
+            return json.loads(row["result_json"])
+    except Exception:
+        return None
+    return None
+
+
+def _cache_put_sqlite(
+    con: Optional[sqlite3.Connection],
+    cache_key: str,
+    model: str,
+    data: Dict[str, Any],
+) -> None:
+    if con is None:
+        return
+    try:
+        con.execute(
+            "INSERT OR REPLACE INTO llm_cache(cache_key, model, result_json) VALUES (?,?,?)",
+            (cache_key, model, json.dumps(data, ensure_ascii=False)),
+        )
+    except Exception:
+        return
diff --git a/extract/parse.py b/extract/parse.py
new file mode 100644
index 0000000..f868fb4
--- /dev/null
+++ b/extract/parse.py
@@ -0,0 +1,659 @@
+from __future__ import annotations
+
+import json
+import re
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple
+from tg_resume_db.normalize import normalize_skill
+from tg_resume_db.extract.experience import extract_experience
+
+EMAIL_RE = re.compile(r"\b[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}\b", re.I)
+EMAIL_SPLIT_RE = re.compile(
+    r"(?P<prefix>[a-z0-9][a-z0-9._%+\-]{1,40})\s+"
+    r"(?P<tail>[a-z0-9][a-z0-9._%+\-]{0,40}@[a-z0-9.\-]+\.[a-z]{2,})",
+    re.I,
+)
+PHONE_RE = re.compile(r"(? List[Tuple[str, re.Pattern]]:
+    patterns: List[Tuple[str, re.Pattern]] = []
+    for skill in sorted(SKILLS):
+        aliases = [skill] + _SKILL_ALIASES.get(skill, [])
+        for alias in aliases:
+            if skill == "java" and alias == "java":
+                # Do not match "java" inside "java script".
+                pat = re.compile(r"(? Dict[str, List[re.Pattern]]:
+    out: Dict[str, List[re.Pattern]] = {}
+    for role in ROLES:
+        aliases = _ROLE_ALIASES.get(role, [role])
+        out[role] = [
+            re.compile(r"(?\d{1,2})\s*(?:год|года|лет|years?|yrs?)"
+    r"(?:[^0-9]{0,20}(?P\d{1,2})\s*(?:мес|месяц|месяца|месяцев|months?))?"
+)
+
+EXP_NEARBY_RE = re.compile(
+    r"(?i)\b(?P<y>\d{1,2})\s*(?:год|года|лет|years?|yrs?)"
+    r"(?:[^0-9]{0,20}(?P<m>\d{1,2})\s*(?:мес|месяц|месяца|месяцев|months?))?"
+)
+
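+# Worked example for EXP_NEARBY_RE (sample phrase invented for illustration):
+# "опыт работы 6 лет 3 месяца" yields y=6, m=3 via the named groups, which
+# extract_experience_years() below combines into 6 + 3/12 = 6.25 years.
+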
+HH_FOOTER_RE = re.compile(
+    r"(?P<name>[A-Za-zА-ЯЁ][A-Za-zА-Яа-яЁё'\-\s]{2,80})\s*[•|]\s*резюме\s+обновлено",
+    re.I,
+)
+NAME_KV_RE = re.compile(r"^\s*(name|имя)\s*[:\-]\s*(.+)$", re.I)
+NAME_LINE_RE = re.compile(
+    r"^[A-ZА-ЯЁ][A-Za-zА-Яа-яЁё'\-]+(?:\s+[A-ZА-ЯЁ][A-Za-zА-Яа-яЁё'\-]+){1,3}$"
+)
+NAME_STOPWORDS = {
+    "resume", "cv", "contacts", "contact", "summary", "skills", "experience", "education",
+    "projects", "about", "profile", "objective", "навыки", "опыт", "образование",
+    "контакты", "профиль", "цель", "резюме",
+    "developer", "engineer", "backend", "frontend", "fullstack", "team lead", "tech lead",
+    "backend developer", "frontend developer", "fullstack developer", "software engineer",
+    "разработчик", "инженер", "бэкенд", "фронтенд", "фулстек", "тимлид", "техлид",
+    "top skills", "experience", "education", "languages", "certifications",
+    "skills & endorsements", "endorsements",
+    "university", "state university", "institute", "college", "academy", "school",
+    "bachelor", "master", "degree", "faculty", "университет", "институт", "академия",
+    "колледж", "школа", "бакалавр", "магистр", "факультет",
+}
+
+_NAME_BAD_WORDS = {
+    "skills", "top skills", "experience", "education", "languages", "certifications",
+    "projects", "summary", "about", "profile", "endorsements",
+    "university", "institute", "college", "academy", "school",
+    "bachelor", "master", "degree", "faculty",
+}
+
+NAME_INSTITUTION_RE = re.compile(
+    r"\b("
+    r"university|institute|college|academy|school|faculty|bachelor|master|degree|"
+    r"mathematics|computer science|informatics|physics|economics|management|"
+    r"университет|институт|академ|колледж|школа|факультет|бакалав|магистр|"
+    r"математик|информатик|физик|экономик|менеджмент"
+    r")\b",
+    re.I,
+)
+
+_EMAIL_PREFIX_STOP = {
+    "email", "mail", "contact", "contacts", "phone", "tel", "telegram", "linkedin", "github",
+}
+
+
+def _prune_fragment_emails(values: List[str]) -> List[str]:
+    uniq = sorted(set(v.lower().strip() for v in values if v and "@" in v))
+    out: List[str] = []
+    for e in uniq:
+        local, domain = e.split("@", 1)
+        drop = False
+        for other in uniq:
+            if other == e:
+                continue
+            ol, od = other.split("@", 1)
+            if od != domain:
+                continue
+            if len(local) <= 8 and len(ol) > len(local) + 2 and ol.endswith(local) and re.search(r"[._\-]", ol):
+                drop = True
+                break
+        if not drop:
+            out.append(e)
+    return out
+
+
+def extract_experience_years(text: str) -> Tuple[Optional[float], Optional[float], float, Dict[str, Any]]:
+    """
+    Returns (total_years, engineering_years, confidence, debug).
+
+    Logic:
+    1. Calculate TOTAL experience from summary lines.
+    2. Check whether the candidate is primarily a recruiter/HR.
+       - If YES: engineering_years = 0.0 (prevents recruiters from showing up as senior devs).
+       - If NO: engineering_years = total_years (optimistic assumption for genuine developers).
+    """
+    dbg: Dict[str, Any] = {"method": None, "matched": None, "is_recruiter": False}
+
+    total_years: Optional[float] = None
+    confidence = 0.0
+
+    lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]
+
+    # 1. Detect whether this is a recruiter:
+    # check the header (first ~15 lines) for HR titles.
+    header_text = "\n".join(lines[:15])
+    is_recruiter = bool(NON_TECH_ROLES_RE.search(header_text))
+    dbg["is_recruiter"] = is_recruiter
+
+    # 2.
Extract Total Duration + if lines: + # Strategy A: Explicit summary + for i, ln in enumerate(lines[:200]): + if AGE_LINE_RE.search(ln): continue + + # Look for summary line + if EXP_HEADER_RE.search(ln): + window = ln + if i + 1 < len(lines): window += " " + lines[i+1] + if i + 2 < len(lines): window += " " + lines[i+2] + + m = EXP_SUMMARY_RE.search(window) + if m: + y = int(m.group("y")) + mm = int(m.group("m")) if m.group("m") else 0 + total_years = float(round(y + (mm / 12.0), 2)) + if 0 <= total_years <= 60: + dbg["method"] = "summary" + dbg["matched"] = m.group(0) + confidence = 0.95 + break + + # Strategy B: Fallback nearby + if total_years is None: + safe_lines = [ln for ln in lines if not AGE_LINE_RE.search(ln)] + for i, ln in enumerate(safe_lines): + if not EXP_HEADER_RE.search(ln): continue + chunk = " ".join(safe_lines[i : i + 12]) + m = EXP_NEARBY_RE.search(chunk) + if m: + y = int(m.group("y")) + mm = int(m.group("m")) if m.group("m") else 0 + val = float(round(y + (mm / 12.0), 2)) + if 0 <= val <= 60: + total_years = val + dbg["method"] = "header_chunk" + dbg["matched"] = m.group(0) + confidence = 0.80 + break + + # 2.5 Timeline/range fallback-reconciliation + # Protects against cases where summary parser catches one short fragment + # while CV has a long timeline. + try: + alt = extract_experience(text or "") + except Exception: + alt = None + if alt and alt.years is not None: + if total_years is None: + total_years = alt.years + confidence = max(confidence, alt.confidence) + dbg["method"] = "timeline_fallback" + dbg["matched"] = "date_ranges" + elif alt.years > (total_years + 1.0): + strong_summary = str(dbg.get("method") or "") in ("summary", "header_chunk") and confidence >= 0.78 + if strong_summary and (alt.years - float(total_years)) > 1.5: + dbg["reconcile"] = "timeline_skip_strong_summary" + else: + total_years = alt.years + confidence = max(confidence, min(0.82, alt.confidence)) + dbg["method"] = "timeline_reconcile" + dbg["matched"] = "date_ranges" + + # 3. Calculate Engineering Years + eng_years = total_years + if is_recruiter: + # If they are a recruiter, their "engineering" experience is effectively 0 + # for the purpose of finding a Developer. 
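+        # total_years itself is still returned unchanged below, so the
+        # candidate's real tenure stays visible; only the engineering
+        # figure is zeroed.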
+ eng_years = 0.0 + + return total_years, eng_years, confidence, dbg + + +def _norm_phone(p: str) -> str: + digits = re.sub(r"\D+", "", p) + if digits.startswith("8") and len(digits) == 11: + digits = "7" + digits[1:] + return "+" + digits if digits else "" + +def _norm_token(s: str) -> str: + return re.sub(r"\s+", " ", s.strip().lower()) + +def safe_json(v) -> str: + return json.dumps(v, ensure_ascii=False) + +def extract_contacts(text: str) -> Dict[str, List[str]]: + emails_set = set(m.group(0).lower() for m in EMAIL_RE.finditer(text or "")) + for m in EMAIL_SPLIT_RE.finditer(text or ""): + prefix = m.group("prefix").strip().lower().strip(".-_") + if not prefix or prefix in _EMAIL_PREFIX_STOP: + continue + if not re.search(r"[._\-\d]", prefix): + continue + tail = m.group("tail").lower() + if "@" not in tail: + continue + local_tail, domain = tail.split("@", 1) + local = f"{prefix}{local_tail}" + if len(local) > 64: + continue + cand = f"{local}@{domain}" + if EMAIL_RE.fullmatch(cand): + emails_set.add(cand) + emails = _prune_fragment_emails(sorted(emails_set)) + phones = sorted(set(_norm_phone(m.group(1)) for m in PHONE_RE.finditer(text) if _norm_phone(m.group(1)))) + tg = sorted(set(m.group(1).lower() for m in TG_RE.finditer(text))) + gh = sorted(set(m.group(1).lower() for m in GITHUB_RE.finditer(text))) + li = sorted(set(m.group(1).lower() for m in LINKEDIN_RE.finditer(text))) + urls = sorted(set(m.group(0) for m in URL_RE.finditer(text))) + return {"emails": emails, "phones": phones, "telegram": tg, "github": gh, "linkedin": li, "urls": urls} + +def extract_name_guess(text: str) -> Optional[str]: + lines = [ln.strip() for ln in text.splitlines() if ln.strip()] + if not lines: + return None + + # 1) HH footer "Name • Резюме обновлено ..." + m = HH_FOOTER_RE.search(text or "") + if m: + cand = m.group("name").strip() + if _looks_like_name_line(cand): + return cand + + # 2) Key-value line: "Name: ..." / "Имя: ..." 
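+    # e.g. an (invented) line "Имя: Петров Иван | backend" keeps only
+    # "Петров Иван": everything after the first separator is cut, then the
+    # remainder is validated as a plausible name line.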
+ for ln in lines[:40]: + m2 = NAME_KV_RE.match(ln) + if m2: + cand = m2.group(2).strip() + cand = re.split(r"[|,/;]", cand)[0].strip() + if _looks_like_name_line(cand): + return cand + + # 3) Name-like in first ~40 lines + for ln in lines[:40]: + if _looks_like_heading_line(ln): + continue + if _looks_like_name_line(ln): + return ln + + # 4) Name-like near the end (pptx exports often put name there) + tail_start = max(0, len(lines) - 60) + for i in range(tail_start, len(lines)): + ln = lines[i] + if _looks_like_heading_line(ln): + continue + ctx = " ".join(lines[max(0, i - 2) : min(len(lines), i + 3)]).lower() + if NAME_INSTITUTION_RE.search(ctx): + continue + if _looks_like_name_line(ln): + return ln + + return None + + +def _looks_like_heading_line(line: str) -> bool: + low = (line or "").strip().lower() + if not low: + return False + if low in _NAME_BAD_WORDS: + return True + if low.startswith("top skills"): + return True + if len(low.split()) <= 3 and any(w in low for w in ("skills", "experience", "education", "languages")): + return True + return False + + +def _looks_like_name_line(line: str) -> bool: + if not line: + return False + if len(line) > 80: + return False + low = line.lower().strip() + if low in NAME_STOPWORDS: + return False + if _looks_like_heading_line(line): + return False + if re.search(r"\b(resume|cv|резюме)\b", line, re.I): + return False + if NAME_INSTITUTION_RE.search(line): + return False + if not NAME_LINE_RE.match(line.strip()): + return False + return True + +def extract_remote(text: str) -> Optional[bool]: + if not text: + return None + for ln in text.splitlines()[:120]: + if REMOTE_RE.search(ln): + return True + return None + +def extract_english(text: str) -> Optional[str]: + t = text or "" + lines = [ln.strip() for ln in t.splitlines() if ln.strip()] + + # 1) CEFR levels anywhere are accepted. + m = EN_RE.search(t) + if m: + return m.group(1).replace("+", "").upper() + + # 2) Textual levels only when English context is present. + candidate_chunks: List[str] = [] + for i, ln in enumerate(lines): + if EN_LANG_RE.search(ln): + candidate_chunks.append(ln) + if i + 1 < len(lines): + candidate_chunks.append(lines[i + 1]) + + if not candidate_chunks: + return None + + m2 = EN_TEXT_RE.search("\n".join(candidate_chunks)) + if not m2: + return None + word = m2.group(1).lower() + if word in ("native", "fluent", "proficient", "advanced"): + return "C1" + if word.startswith("upper"): + return "B2" + if word == "intermediate": + return "B1" + if word == "elementary": + return "A2" + return None + +def extract_roles_skills(text: str) -> Tuple[List[str], List[str]]: + """ + Extracts roles and skills, but strictly filters out HR/Recruitment context. + """ + lines = text.splitlines() + + # 1. Filter text: Remove lines that talk about hiring/vacancies + clean_lines = [] + for ln in lines: + if not HR_CONTEXT_RE.search(ln): + clean_lines.append(ln) + + clean_text = "\n".join(clean_lines).lower() + + # 2. Extract Skills from clean text only + skills = [] + for s, pat in _SKILL_PATTERNS: + if pat.search(clean_text): + skills.append(normalize_skill(s) or s) + skills = sorted(set(skills)) + + # 3. Extract Roles + # Priority: Header (first 10 lines) + header_text = "\n".join(lines[:10]).lower() + + found_roles = set() + + # Check if Recruiter + if NON_TECH_ROLES_RE.search(header_text): + # If explicit recruiter in header, do NOT add generic tech roles like "backend" + # even if they appear in the text (often describes who they hire). 
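+        # e.g. an "IT Recruiter" resume that says "hiring Python backend
+        # engineers" must not come back with roles=["backend"] (illustrative
+        # phrasing; NON_TECH_ROLES_RE drives the actual detection).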
+ pass + else: + # Normal extraction + for r in ROLES: + pats = _ROLE_PATTERNS.get(r, []) + if any(p.search(clean_text) for p in pats): + # extra guard: devops requires explicit evidence, not just CI/CD mentions + if r == "devops": + if not re.search(r"\b(devops|dev ops|sre|platform engineer|infrastructure)\b", clean_text, re.I): + continue + found_roles.add(r) + + return sorted(list(found_roles)), skills + +def norm_pipe(tokens: List[str]) -> str: + toks = [_norm_token(t) for t in tokens if _norm_token(t)] + uniq = sorted(set(toks)) + return "|" + "|".join(uniq) + "|" if uniq else "|" + +def extract_salary(text: str) -> Tuple[Optional[int], Optional[int], float, Dict]: + dbg: Dict[str, Any] = {"numbers": [], "currency_hits": 0, "hint_lines": 0, "used_lines": []} + lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()] + if not lines: + return None, None, 0.0, dbg + + candidates: List[Tuple[int, str, bool, bool]] = [] + for i, ln in enumerate(lines): + has_hint = SALARY_HINT_RE.search(ln) is not None + has_pay = PAY_TOKEN_RE.search(ln) is not None + if not has_hint and not has_pay: + continue + if SALARY_NOISE_RE.search(ln) and not has_hint: + continue + candidates.append((i, ln, has_hint, has_pay)) + + if not candidates: + return None, None, 0.0, dbg + + has_hint = any(x[2] for x in candidates) + if not has_hint: + # Inline pay without "salary" is allowed only near header/contact block. + candidates = [x for x in candidates if x[0] < 15] + if not candidates: + return None, None, 0.0, dbg + + scan_chunks: List[str] = [] + for i, ln, hint, _ in candidates: + chunk = ln + if hint and (i + 1) < len(lines): + chunk = f"{chunk} {lines[i + 1]}" + scan_chunks.append(chunk) + dbg["used_lines"].append(ln) + if hint: + dbg["hint_lines"] += 1 + dbg["currency_hits"] += len(CURRENCY_RE.findall(chunk)) + + nums: List[int] = [] + for chunk in scan_chunks: + for m in NUM_RE.finditer(chunk): + val = None + if m.group(1) and m.group(2): + val = int(m.group(1)) * 1000 + elif m.group(3): + val = int(re.sub(r"\s+", "", m.group(3))) + elif m.group(4): + val = int(m.group(4)) + if val and 20_000 <= val <= 30_000_000: + nums.append(val) + dbg["numbers"].append(val) + + if not nums: + return None, None, 0.0, dbg + + nums = sorted(nums) + salary_min = nums[0] + salary_max = nums[-1] if len(nums) > 1 else nums[0] + + if dbg["hint_lines"] > 0: + conf = 0.82 if dbg["currency_hits"] > 0 else 0.70 + else: + conf = 0.58 if dbg["currency_hits"] > 0 else 0.0 + + if salary_max > salary_min * 4: + conf -= 0.12 + if len(nums) == 1: + conf -= 0.06 + + conf = max(0.0, min(conf, 0.9)) + if conf < 0.45: + return None, None, conf, dbg + return salary_min, salary_max, conf, dbg + +def extract_location_best_effort(text: str) -> Optional[str]: + if not text: + return None + + def _clean_loc(val: str) -> str: + return re.sub(r"\s+", " ", (val or "").strip(" |,;")) + + def _is_loc_like(val: str, *, allow_single: bool = False) -> bool: + v = _clean_loc(val) + if not v or len(v) < 3 or len(v) > 90: + return False + if re.search(r"[@/\\]", v) or re.search(r"\d{3,}", v): + return False + if SECTION_HEADER_RE.match(v): + return False + if LOCATION_CITY_COUNTRY_RE.match(v): + return True + if allow_single and re.fullmatch(r"[A-Za-zА-Яа-я][A-Za-zА-Яа-я' .\-]{1,40}", v): + return True + return False + + patterns = [ + re.compile(r"(?i)\b(location|город|city)\s*:\s*(.{2,40})"), + re.compile(r"(?i)\b(место)\s*:\s*(.{2,40})"), + re.compile(r"(?i)\b(проживает|проживание)\s*:\s*(.{2,60})"), + ] + for p in patterns: + m = 
p.search(text) + if m: + val = _clean_loc(m.group(2)) + if _is_loc_like(val, allow_single=True): + return val + + lines = [ln.strip() for ln in text.splitlines() if ln.strip()] + head: List[str] = [] + for ln in lines[:60]: + if SECTION_HEADER_RE.match(ln): + low = ln.lower() + if low in ("contacts", "contact", "contact info"): + continue + break + head.append(ln) + + for ln in head: + parts = [ln] + [seg.strip() for seg in ln.split("|") if seg.strip()] + for seg in parts: + if _is_loc_like(seg): + return _clean_loc(seg) + return None diff --git a/extract/pdf_extract.py b/extract/pdf_extract.py new file mode 100644 index 0000000..cb2da4b --- /dev/null +++ b/extract/pdf_extract.py @@ -0,0 +1,211 @@ +from __future__ import annotations + +import re +import shutil +import subprocess +from dataclasses import dataclass +from pathlib import Path +from typing import List, Optional, Tuple + +try: # optional dependency + from pypdf import PdfReader # type: ignore +except Exception: # pragma: no cover + try: + from PyPDF2 import PdfReader # type: ignore + except Exception: # pragma: no cover + PdfReader = None # type: ignore + +try: # optional dependency + from pdfminer.high_level import extract_text as pdfminer_extract_text # type: ignore +except Exception: # pragma: no cover + pdfminer_extract_text = None # type: ignore + + +@dataclass +class PdfExtractResult: + text: str + pages: List[dict] + method: str + score: float + flags: List[str] + + +_SECTION_HINTS = [ + "experience", "work experience", "skills", "education", "projects", "summary", "about", + "опыт работы", "навыки", "образование", "проекты", "о себе", +] + + +def _which_pdftotext() -> Optional[str]: + exe = shutil.which("pdftotext") or shutil.which("pdftotext.exe") + return exe + + +def _run_pdftotext(path: Path, *, layout: bool, timeout_sec: int = 25) -> str: + exe = _which_pdftotext() + if not exe: + return "" + cmd = [exe] + if layout: + cmd.append("-layout") + cmd += ["-nopgbrk", str(path), "-"] + try: + p = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + timeout=timeout_sec, + check=False, + text=True, + encoding="utf-8", + errors="ignore", + ) + return (p.stdout or "").strip() + except Exception: + return "" + + +def _extract_pages_pypdf(path: Path, max_pages: int = 60) -> List[dict]: + if PdfReader is None: + return [] + try: + reader = PdfReader(str(path), strict=False) + except Exception: + return [] + pages: List[dict] = [] + for i, page in enumerate(getattr(reader, "pages", [])): + if max_pages and i >= max_pages: + break + try: + text = page.extract_text() or "" + except Exception: + text = "" + pages.append({"page": i + 1, "text": text}) + return pages + + +def _extract_pdfminer(path: Path) -> str: + if pdfminer_extract_text is None: + return "" + try: + return (pdfminer_extract_text(str(path)) or "").strip() + except Exception: + return "" + + +def _quality_score(text: str) -> Tuple[float, List[str]]: + flags: List[str] = [] + if not text: + return 0.0, ["empty"] + + total = len(text) + letters = sum(ch.isalpha() for ch in text) + spaces = text.count(" ") + alpha_ratio = letters / max(1, total) + space_ratio = spaces / max(1, total) + + words = re.findall(r"[A-Za-zА-Яа-я0-9]+", text) + avg_word_len = (sum(len(w) for w in words) / max(1, len(words))) if words else 0.0 + + lines = [ln for ln in text.splitlines() if ln.strip()] + long_lines = [ln for ln in lines if len(ln) > 200] + long_line_ratio = (len(long_lines) / max(1, len(lines))) if lines else 0.0 + + glued_hits = 
len(re.findall(r"[a-zа-я][A-ZА-Я]|[A-Za-zА-Яа-я][0-9]|[0-9][A-Za-zА-Яа-я]", text)) + + section_hits = sum(1 for s in _SECTION_HINTS if s in text.lower()) + + score = 0.0 + if alpha_ratio >= 0.45: + score += 2.0 + elif alpha_ratio >= 0.30: + score += 1.0 + else: + flags.append("low_alpha") + + if 0.10 <= space_ratio <= 0.28: + score += 1.0 + else: + flags.append("odd_spacing") + + if 3.5 <= avg_word_len <= 9.0: + score += 1.0 + else: + flags.append("odd_word_len") + + if long_line_ratio <= 0.06: + score += 1.0 + else: + flags.append("long_lines") + + if glued_hits <= 6: + score += 1.0 + else: + flags.append("glued_text") + + if section_hits >= 2: + score += 1.0 + elif section_hits == 1: + score += 0.5 + + if total < 200: + flags.append("short_text") + + if alpha_ratio < 0.08 or total < 120: + flags.append("scan_like") + + return score, flags + + +def deglue_text(text: str) -> str: + if not text: + return text + t = text + t = re.sub(r"([a-zа-я])([A-ZА-Я])", r"\1 \2", t) + t = re.sub(r"([A-Za-zА-Яа-я])([0-9])", r"\1 \2", t) + t = re.sub(r"([0-9])([A-Za-zА-Яа-я])", r"\1 \2", t) + t = re.sub(r"([:;])([A-Za-zА-Яа-я])", r"\1 \2", t) + return t + + +def extract_pdf_best(path: Path, timeout_sec: int = 25) -> PdfExtractResult: + candidates: List[Tuple[str, str]] = [] + + txt_layout = _run_pdftotext(path, layout=True, timeout_sec=timeout_sec) + if txt_layout: + candidates.append(("pdftotext_layout", txt_layout)) + + txt_plain = _run_pdftotext(path, layout=False, timeout_sec=timeout_sec) + if txt_plain: + candidates.append(("pdftotext_plain", txt_plain)) + + txt_pypdf = "" + if PdfReader is not None: + pages = _extract_pages_pypdf(path) + if pages: + txt_pypdf = "\n\n".join(p.get("text", "") for p in pages if p.get("text")) + if txt_pypdf: + candidates.append(("pypdf", txt_pypdf)) + + txt_pdfminer = _extract_pdfminer(path) + if txt_pdfminer: + candidates.append(("pdfminer", txt_pdfminer)) + + if not candidates: + return PdfExtractResult(text="", pages=[], method="none", score=0.0, flags=["empty"]) + + best_method = "none" + best_text = "" + best_score = -1.0 + best_flags: List[str] = [] + for method, text in candidates: + score, flags = _quality_score(text) + if score > best_score: + best_score = score + best_method = method + best_text = text + best_flags = flags + + pages = _extract_pages_pypdf(path) + best_text = deglue_text(best_text) + return PdfExtractResult(text=best_text, pages=pages, method=best_method, score=best_score, flags=best_flags) diff --git a/extract/sections.py b/extract/sections.py new file mode 100644 index 0000000..8432149 --- /dev/null +++ b/extract/sections.py @@ -0,0 +1,70 @@ +from __future__ import annotations + +import re +from typing import Dict, List, Optional, Tuple + + +_SECTION_PATTERNS: dict[str, List[re.Pattern]] = { + "contacts": [ + re.compile(r"^\s*(contacts?|contact info|контакты)\s*$", re.I), + ], + "about": [ + re.compile(r"^\s*(summary|about|profile|objective|о\s+себе|обо\s+мне|профиль|цель)\s*$", re.I), + ], + "skills": [ + re.compile(r"^\s*(skills?|key skills|stack|tech( stack)?|навыки|технологии|компетенции)\s*$", re.I), + ], + "experience": [ + re.compile(r"^\s*(experience|work experience|employment|опыт\s+работы|опыт)\s*$", re.I), + ], + "education": [ + re.compile(r"^\s*(education|образование|курсы|certifications?|сертификаты)\s*$", re.I), + ], + "projects": [ + re.compile(r"^\s*(projects?|проекты)\s*$", re.I), + ], + "languages": [ + re.compile(r"^\s*(languages?|языки)\s*$", re.I), + ], + "certifications": [ + 
re.compile(r"^\s*(certifications?|сертификаты|курсы)\s*$", re.I), + ], + "publications": [ + re.compile(r"^\s*(publications?|публикации)\s*$", re.I), + ], +} + + +def _match_header(line: str) -> Optional[str]: + for key, patterns in _SECTION_PATTERNS.items(): + for rx in patterns: + if rx.match(line): + return key + return None + + +def split_sections(clean_text: str, doc_type: str | None = None) -> Dict[str, str]: + lines = [ln.strip() for ln in (clean_text or "").splitlines()] + sections: Dict[str, List[str]] = {"header": []} + current = "header" + + for ln in lines: + if not ln: + continue + key = _match_header(ln) + if key: + current = key + sections.setdefault(current, []) + continue + sections.setdefault(current, []).append(ln) + + out: Dict[str, str] = {} + for k, vals in sections.items(): + text = "\n".join(vals).strip() + if text: + out[k] = text + return out + + +def sections_present(sections: Dict[str, str]) -> List[str]: + return sorted([k for k, v in (sections or {}).items() if v and k != "header"]) diff --git a/extract/templates/__init__.py b/extract/templates/__init__.py new file mode 100644 index 0000000..a9a2c5b --- /dev/null +++ b/extract/templates/__init__.py @@ -0,0 +1 @@ +__all__ = [] diff --git a/extract/templates/generic.py b/extract/templates/generic.py new file mode 100644 index 0000000..e6712a4 --- /dev/null +++ b/extract/templates/generic.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +from typing import Any, Dict + +from tg_resume_db.extract.parse import ( + extract_contacts, + extract_name_guess, + extract_remote, + extract_english, + extract_roles_skills, + extract_salary, + extract_location_best_effort, + extract_experience_years, +) + + +def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]: + text = clean_text or "" + + contacts_raw = extract_contacts(text) + name = extract_name_guess(text) + remote = extract_remote(text) + english = extract_english(text) + roles, skills = extract_roles_skills(text) + location = extract_location_best_effort(text) + exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(text) + sal_min, sal_max, sal_conf, sal_dbg = extract_salary(text) + + return { + "name": name, + "contacts_raw": contacts_raw, + "remote": remote, + "english": english, + "roles": roles, + "skills": skills, + "location": location, + "exp_years": exp_years, + "exp_years_eng": exp_years_eng, + "exp_conf": exp_conf, + "exp_dbg": exp_dbg, + "salary_min": sal_min, + "salary_max": sal_max, + "salary_conf": sal_conf, + "salary_dbg": sal_dbg, + "parse_method": "generic_heur", + } diff --git a/extract/templates/hh.py b/extract/templates/hh.py new file mode 100644 index 0000000..418de83 --- /dev/null +++ b/extract/templates/hh.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +from typing import Any, Dict + +from tg_resume_db.extract.parse import ( + extract_contacts, + extract_name_guess, + extract_remote, + extract_english, + extract_roles_skills, + extract_salary, + extract_location_best_effort, + extract_experience_years, +) + + +def _pick(sections: Dict[str, str] | None, key: str, fallback: str) -> str: + if not sections: + return fallback + return sections.get(key) or fallback + + +def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]: + header_text = _pick(sections, "header", clean_text) + contacts_text = _pick(sections, "contacts", clean_text) + about_text = _pick(sections, "about", clean_text) + skills_text = _pick(sections, "skills", clean_text) + 
exp_text = _pick(sections, "experience", clean_text) + exp_scope = "\n".join([about_text, exp_text]).strip() or exp_text + + name = extract_name_guess(header_text) + contacts_raw = extract_contacts(contacts_text) + roles, skills = extract_roles_skills("\n".join([about_text, skills_text, exp_text])) + + remote = extract_remote(clean_text) + english = extract_english(clean_text) + location = extract_location_best_effort(clean_text) + exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(exp_scope) + sal_min, sal_max, sal_conf, sal_dbg = extract_salary(clean_text) + + return { + "name": name, + "contacts_raw": contacts_raw, + "remote": remote, + "english": english, + "roles": roles, + "skills": skills, + "location": location, + "exp_years": exp_years, + "exp_years_eng": exp_years_eng, + "exp_conf": exp_conf, + "exp_dbg": exp_dbg, + "salary_min": sal_min, + "salary_max": sal_max, + "salary_conf": sal_conf, + "salary_dbg": sal_dbg, + "parse_method": "hh_template", + } diff --git a/extract/templates/hh_ru.py b/extract/templates/hh_ru.py new file mode 100644 index 0000000..d6f1c7b --- /dev/null +++ b/extract/templates/hh_ru.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +import re +from typing import Any, Dict, Optional + +from tg_resume_db.extract.parse import ( + extract_contacts, + extract_name_guess, + extract_remote, + extract_english, + extract_roles_skills, + extract_salary, + extract_location_best_effort, + extract_experience_years, +) + + +_DESIRED_RE = re.compile(r"(?i)жел[а-я]*\s+должност[ьи]\s*[:\-]?\s*(.+)") +_SPEC_RE = re.compile(r"(?i)специализаци[яи]\s*[:\-]?\s*(.+)") +_SCHEDULE_RE = re.compile(r"(?i)график\s+работы\s*[:\-]?\s*(.+)") +_EMPLOYMENT_RE = re.compile(r"(?i)занятость\s*[:\-]?\s*(.+)") + + +def _pick(sections: Dict[str, str] | None, key: str, fallback: str) -> str: + if not sections: + return fallback + return sections.get(key) or fallback + + +def _find_first(regex: re.Pattern, text: str) -> Optional[str]: + for ln in text.splitlines(): + m = regex.search(ln) + if m: + val = m.group(1).strip() + val = re.split(r"[|;/]", val)[0].strip() + if 2 <= len(val) <= 80: + return val + return None + + +def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]: + header_text = _pick(sections, "header", clean_text) + contacts_text = _pick(sections, "contacts", clean_text) + about_text = _pick(sections, "about", clean_text) + skills_text = _pick(sections, "skills", clean_text) + exp_text = _pick(sections, "experience", clean_text) + exp_scope = "\n".join([about_text, exp_text]).strip() or exp_text + + name = extract_name_guess(header_text) + contacts_raw = extract_contacts(contacts_text) + roles, skills = extract_roles_skills("\n".join([about_text, skills_text, exp_text])) + + remote = extract_remote(clean_text) + english = extract_english(clean_text) + location = extract_location_best_effort(clean_text) + exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(exp_scope) + sal_min, sal_max, sal_conf, sal_dbg = extract_salary(clean_text) + + desired_title = _find_first(_DESIRED_RE, clean_text) + specializations = _find_first(_SPEC_RE, clean_text) + schedule = _find_first(_SCHEDULE_RE, clean_text) + employment = _find_first(_EMPLOYMENT_RE, clean_text) + + return { + "name": name, + "contacts_raw": contacts_raw, + "remote": remote, + "english": english, + "roles": roles, + "skills": skills, + "location": location, + "exp_years": exp_years, + "exp_years_eng": exp_years_eng, + "exp_conf": exp_conf, + "exp_dbg": 
exp_dbg, + "salary_min": sal_min, + "salary_max": sal_max, + "salary_conf": sal_conf, + "salary_dbg": sal_dbg, + "desired_title": desired_title, + "specializations": specializations, + "employment_type": employment, + "schedule": schedule, + "parse_method": "hh_template", + } diff --git a/extract/templates/linkedin.py b/extract/templates/linkedin.py new file mode 100644 index 0000000..294ad00 --- /dev/null +++ b/extract/templates/linkedin.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +from typing import Any, Dict + +from tg_resume_db.extract.parse import ( + extract_contacts, + extract_name_guess, + extract_remote, + extract_english, + extract_roles_skills, + extract_salary, + extract_location_best_effort, + extract_experience_years, +) + + +def _pick(sections: Dict[str, str] | None, key: str, fallback: str) -> str: + if not sections: + return fallback + return sections.get(key) or fallback + + +def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]: + header_text = _pick(sections, "header", clean_text) + about_text = _pick(sections, "about", clean_text) + skills_text = _pick(sections, "skills", clean_text) + exp_text = _pick(sections, "experience", clean_text) + exp_scope = "\n".join([about_text, exp_text]).strip() or exp_text + + name = extract_name_guess(header_text) + contacts_raw = extract_contacts(clean_text) + roles, skills = extract_roles_skills("\n".join([about_text, skills_text, exp_text])) + + remote = extract_remote(clean_text) + english = extract_english(clean_text) + location = extract_location_best_effort(clean_text) + exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(exp_scope) + sal_min, sal_max, sal_conf, sal_dbg = extract_salary(clean_text) + + return { + "name": name, + "contacts_raw": contacts_raw, + "remote": remote, + "english": english, + "roles": roles, + "skills": skills, + "location": location, + "exp_years": exp_years, + "exp_years_eng": exp_years_eng, + "exp_conf": exp_conf, + "exp_dbg": exp_dbg, + "salary_min": sal_min, + "salary_max": sal_max, + "salary_conf": sal_conf, + "salary_dbg": sal_dbg, + "parse_method": "linkedin_template", + } diff --git a/extract/templates/one_page.py b/extract/templates/one_page.py new file mode 100644 index 0000000..5282df5 --- /dev/null +++ b/extract/templates/one_page.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +from typing import Any, Dict + +from tg_resume_db.extract.parse import ( + extract_contacts, + extract_name_guess, + extract_remote, + extract_english, + extract_roles_skills, + extract_salary, + extract_location_best_effort, + extract_experience_years, +) + + +def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]: + text = clean_text or "" + + contacts_raw = extract_contacts(text) + name = extract_name_guess(text) + roles, skills = extract_roles_skills(text) + remote = extract_remote(text) + english = extract_english(text) + location = extract_location_best_effort(text) + exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(text) + sal_min, sal_max, sal_conf, sal_dbg = extract_salary(text) + + return { + "name": name, + "contacts_raw": contacts_raw, + "remote": remote, + "english": english, + "roles": roles, + "skills": skills, + "location": location, + "exp_years": exp_years, + "exp_years_eng": exp_years_eng, + "exp_conf": exp_conf, + "exp_dbg": exp_dbg, + "salary_min": sal_min, + "salary_max": sal_max, + "salary_conf": sal_conf, + "salary_dbg": sal_dbg, + "parse_method": 
"one_page_template", + } diff --git a/extract/templates/one_page_en.py b/extract/templates/one_page_en.py new file mode 100644 index 0000000..696e67e --- /dev/null +++ b/extract/templates/one_page_en.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from typing import Any, Dict + +from tg_resume_db.extract.templates.one_page import parse_resume as _parse + + +def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]: + out = _parse(clean_text, sections) + out["parse_method"] = "one_page_en" + return out diff --git a/extract/templates/one_page_ru.py b/extract/templates/one_page_ru.py new file mode 100644 index 0000000..24610cf --- /dev/null +++ b/extract/templates/one_page_ru.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from typing import Any, Dict + +from tg_resume_db.extract.templates.one_page import parse_resume as _parse + + +def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]: + out = _parse(clean_text, sections) + out["parse_method"] = "one_page_ru" + return out diff --git a/extract/templates/pptx_export.py b/extract/templates/pptx_export.py new file mode 100644 index 0000000..c0c8935 --- /dev/null +++ b/extract/templates/pptx_export.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from typing import Any, Dict + +from tg_resume_db.extract.parse import ( + extract_contacts, + extract_name_guess, + extract_remote, + extract_english, + extract_roles_skills, + extract_salary, + extract_location_best_effort, + extract_experience_years, +) + + +def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]: + text = clean_text or "" + contacts_raw = extract_contacts(text) + name = extract_name_guess(text) + roles, skills = extract_roles_skills(text) + remote = extract_remote(text) + english = extract_english(text) + location = extract_location_best_effort(text) + exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(text) + sal_min, sal_max, sal_conf, sal_dbg = extract_salary(text) + + return { + "name": name, + "contacts_raw": contacts_raw, + "remote": remote, + "english": english, + "roles": roles, + "skills": skills, + "location": location, + "exp_years": exp_years, + "exp_years_eng": exp_years_eng, + "exp_conf": exp_conf, + "exp_dbg": exp_dbg, + "salary_min": sal_min, + "salary_max": sal_max, + "salary_conf": sal_conf, + "salary_dbg": sal_dbg, + "parse_method": "pptx_template", + } diff --git a/extract/text_extract.py b/extract/text_extract.py new file mode 100644 index 0000000..17ed285 --- /dev/null +++ b/extract/text_extract.py @@ -0,0 +1,99 @@ +from __future__ import annotations + +import os +from pathlib import Path +import logging +from bs4 import BeautifulSoup + +try: # optional dependency for PDF fallback + from pypdf import PdfReader as _PdfReader # type: ignore +except Exception: # pragma: no cover - optional import + try: + from PyPDF2 import PdfReader as _PdfReader # type: ignore + except Exception: # pragma: no cover + _PdfReader = None # type: ignore + +def _read_bytes(path: Path) -> bytes: + return path.read_bytes() + +def extract_text_from_txt(path: Path) -> str: + data = _read_bytes(path) + for enc in ("utf-8", "utf-16", "cp1251", "latin-1"): + try: + return data.decode(enc, errors="ignore") + except Exception: + continue + return data.decode("utf-8", errors="ignore") + +def extract_text_from_html(path: Path) -> str: + html = extract_text_from_txt(path) + soup = BeautifulSoup(html, "lxml") + return soup.get_text("\n", 
strip=True) + +def extract_text_from_docx(path: Path) -> str: + from docx import Document + doc = Document(str(path)) + parts = [] + for p in doc.paragraphs: + if p.text and p.text.strip(): + parts.append(p.text.strip()) + for table in doc.tables: + for row in table.rows: + cells = [c.text.strip() for c in row.cells if c.text and c.text.strip()] + if cells: + parts.append(" | ".join(cells)) + return "\n".join(parts) + +_PDF_PAGE_LIMIT = int(os.environ.get("PDF_PAGE_LIMIT", "40")) +# Silence noisy pypdf warnings like "Ignoring wrong pointing object ..." +logging.getLogger("pypdf").setLevel(logging.ERROR) +logging.getLogger("PyPDF2").setLevel(logging.ERROR) + + +def extract_text_from_pdf(path: Path) -> str: + """ + Lightweight PDF extractor; prefers optional PyPDF-based readers over heavy pdfminer. + Reads at most PDF_PAGE_LIMIT pages (default 40) to avoid pathological files. + """ + if _PdfReader is None: + raise RuntimeError("PDF reader dependency missing (install pypdf or PyPDF2)") + + try: + reader = _PdfReader(str(path), strict=False) + except Exception as exc: # pragma: no cover - pdf parser edge cases + raise RuntimeError(f"PDF read failed: {exc}") from exc + + parts = [] + for idx, page in enumerate(getattr(reader, "pages", [])): + if _PDF_PAGE_LIMIT and idx >= _PDF_PAGE_LIMIT: + break + try: + text = page.extract_text() # type: ignore[attr-defined] + except Exception: + text = None + if text: + parts.append(text) + return "\n".join(parts) + +def extract_text_from_doc_best_effort(path: Path) -> str: + # .doc requires external tools; best-effort if textract installed + try: + import textract # type: ignore + b = textract.process(str(path)) + return b.decode("utf-8", errors="ignore") + except Exception: + return "" + +def extract_text(path: Path) -> str: + ext = path.suffix.lower() + if ext in (".txt", ".log"): + return extract_text_from_txt(path) + if ext in (".html", ".htm"): + return extract_text_from_html(path) + if ext == ".docx": + return extract_text_from_docx(path) + if ext == ".pdf": + return extract_text_from_pdf(path) + if ext == ".doc": + return extract_text_from_doc_best_effort(path) + return "" diff --git a/importers/file_scan.py b/importers/file_scan.py new file mode 100644 index 0000000..66a2e8a --- /dev/null +++ b/importers/file_scan.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Dict, Iterator + +RESUME_EXTS = {".pdf", ".docx", ".doc", ".txt", ".html", ".htm"} + +def iter_files(root: Path) -> Iterator[Dict]: + for p in root.rglob("*"): + if p.is_file() and p.suffix.lower() in RESUME_EXTS: + yield { + "origin_type": "file_scan", + "export_path": str(root), + "chat_title": None, + "message_id": None, + "message_date": None, + "message_text": "", + "file_path": str(p.resolve()), + "original_name": p.name, + "extra": {}, + } diff --git a/importers/telegram_html.py b/importers/telegram_html.py new file mode 100644 index 0000000..5336996 --- /dev/null +++ b/importers/telegram_html.py @@ -0,0 +1,66 @@ +from __future__ import annotations + +import re +from pathlib import Path +from typing import Dict, Iterator, List, Optional + +from bs4 import BeautifulSoup + +RESUME_EXTS = {".pdf", ".docx", ".doc", ".txt", ".html", ".htm"} + +def find_messages_html(root: Path) -> List[Path]: + return [p for p in root.rglob("messages*.html") if p.is_file()] + +def iter_artifacts(messages_html: Path) -> Iterator[Dict]: + html = messages_html.read_text(encoding="utf-8", errors="ignore") + soup = BeautifulSoup(html, "lxml") + + chat_title 
= None
    h = soup.find(class_=re.compile(r"page_header", re.I))
    if h:
        chat_title = h.get_text(" ", strip=True)
    chat_title = chat_title or messages_html.parent.name

    for msg in soup.select(".message.default.clearfix, .message"):
        message_id = msg.get("id") or None
        date_div = msg.select_one(".date")
        msg_date = date_div.get("title") if date_div else None

        text_div = msg.select_one(".text")
        msg_text = text_div.get_text("\n", strip=True) if text_div else ""

        file_path = None
        original_name = None
        for a in msg.find_all("a", href=True):
            href = a["href"]
            p = (messages_html.parent / href).resolve()
            if p.exists() and p.suffix.lower() in RESUME_EXTS:
                file_path = str(p)
                original_name = p.name
                break

        if file_path:
            yield {
                "origin_type": "telegram_html",
                "export_path": str(messages_html.parent),
                "chat_title": chat_title,
                "message_id": str(message_id) if message_id else None,
                "message_date": msg_date,
                "message_text": msg_text or "",
                "file_path": file_path,
                "original_name": original_name,
                "extra": {"html_path": str(messages_html)},
            }
        else:
            if msg_text and len(msg_text.strip()) >= 500:
                yield {
                    "origin_type": "message_text",
                    "export_path": str(messages_html.parent),
                    "chat_title": chat_title,
                    "message_id": str(message_id) if message_id else None,
                    "message_date": msg_date,
                    "message_text": msg_text,
                    "file_path": None,
                    "original_name": None,
                    "extra": {"html_path": str(messages_html)},
                }
diff --git a/importers/telegram_json.py b/importers/telegram_json.py
new file mode 100644
index 0000000..5cc9985
--- /dev/null
+++ b/importers/telegram_json.py
@@ -0,0 +1,73 @@
from __future__ import annotations

import json
from pathlib import Path
from typing import Dict, Iterator, List, Optional

RESUME_EXTS = {".pdf", ".docx", ".doc", ".txt", ".html", ".htm"}

def find_result_json(root: Path) -> List[Path]:
    return list(root.rglob("result.json"))

def _text_field_to_str(text_field) -> str:
    if isinstance(text_field, str):
        return text_field
    if isinstance(text_field, list):
        parts = []
        for item in text_field:
            if isinstance(item, str):
                parts.append(item)
            elif isinstance(item, dict) and "text" in item:
                parts.append(str(item["text"]))
        return "".join(parts)
    return ""

def iter_artifacts(result_json: Path) -> Iterator[Dict]:
    data = json.loads(result_json.read_text(encoding="utf-8", errors="ignore"))

    chats = []
    if isinstance(data, dict):
        chats_field = data.get("chats") or []
        if isinstance(chats_field, dict):
            # modern exports: {"chats": {"list": [...]}}
            chats = chats_field.get("list", []) or []
        elif isinstance(chats_field, list):
            # some exports keep the list directly under "chats"; the previous
            # one-liner called .get() on the list and crashed before its
            # fallback branch could run
            chats = chats_field
    for chat in chats:
        chat_title = chat.get("name") or chat.get("title") or "unknown_chat"
        messages = chat.get("messages", []) or []
        for msg in messages:
            msg_id = str(msg.get("id") or "")
            msg_date = msg.get("date") or msg.get("date_unixtime") or None
            text = _text_field_to_str(msg.get("text", ""))

            file_rel = msg.get("file") or None
            file_path = None
            original_name = None
            if file_rel:
                p = (result_json.parent / file_rel).resolve()
                if p.exists() and p.suffix.lower() in RESUME_EXTS:
                    file_path = str(p)
                    original_name = p.name

            if file_path:
                yield {
                    "origin_type": "telegram_json",
                    "export_path": str(result_json.parent),
                    "chat_title": chat_title,
                    "message_id": msg_id,
                    "message_date": str(msg_date) if msg_date is not None else None,
                    "message_text": text or "",
                    "file_path": file_path,
                    "original_name": original_name,
                    "extra": {"json_path": str(result_json)},
                }
            else:
                # message-only resume paste (heuristic)
                if text and len(text.strip()) >= 500:
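                    # A message this long with no attachment is most likely a
                    # resume pasted directly into the chat; anything shorter
                    # is treated as ordinary chatter and skipped.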
yield { + "origin_type": "message_text", + "export_path": str(result_json.parent), + "chat_title": chat_title, + "message_id": msg_id, + "message_date": str(msg_date) if msg_date is not None else None, + "message_text": text, + "file_path": None, + "original_name": None, + "extra": {"json_path": str(result_json)}, + } diff --git a/normalize.py b/normalize.py new file mode 100644 index 0000000..ae7d21b --- /dev/null +++ b/normalize.py @@ -0,0 +1,174 @@ +from __future__ import annotations + +import re +from typing import Dict, List, Optional, Tuple + + +_SKILL_SYNONYMS: Dict[str, List[str]] = { + "python": ["py"], + "javascript": ["js", "node", "nodejs", "java script", "java-script"], + "typescript": ["ts", "type script", "type-script"], + "postgresql": ["postgres", "psql"], + "kubernetes": ["k8s"], + "docker": [], + "fastapi": [], + "django": ["drf", "django rest framework"], + "flask": [], + "golang": ["go"], + "c++": ["cpp"], + "c#": ["csharp"], + "redis": [], + "kafka": [], + "rabbitmq": [], + "grpc": [], + "rest": [], +} + +_SKILL_STOP = {"rest", "http", "json", "xml", "oop"} + +_ROLE_SYNONYMS: Dict[str, List[str]] = { + "backend": ["backend developer", "backend engineer", "бэкенд", "бекенд", "серверный разработчик"], + "frontend": ["frontend developer", "frontend engineer", "фронтенд", "фронт"], + "fullstack": ["full stack", "full-stack", "фулстек", "fullstack developer"], + "devops": ["sre", "site reliability"], + "qa": ["tester", "тестировщик"], + "data": ["data engineer", "data scientist", "ml engineer", "машинное обучение"], + "mobile": ["android", "ios", "mobile developer", "мобильный разработчик"], +} + + +def _build_alias_map(src: Dict[str, List[str]]) -> Dict[str, str]: + alias = {} + for canonical, al in src.items(): + alias[canonical] = canonical + for a in al: + alias[a] = canonical + return {k.lower(): v for k, v in alias.items()} + + +_SKILL_ALIAS = _build_alias_map(_SKILL_SYNONYMS) +_ROLE_ALIAS = _build_alias_map(_ROLE_SYNONYMS) + + +def _normalize_skill_surface(token: str) -> str: + t = (token or "").strip().lower() + if not t: + return "" + t = t.replace("/", " ") + t = re.sub(r"[_\-]+", " ", t) + t = re.sub(r"\s+", " ", t).strip() + + # "java script", "type script", "postgre sql", "graph ql", "g rpc" + t = re.sub(r"\bjava\s+script\b", "javascript", t) + t = re.sub(r"\btype\s+script\b", "typescript", t) + t = re.sub(r"\bpostgre\s+sql\b", "postgresql", t) + t = re.sub(r"\bgraph\s+ql\b", "graphql", t) + t = re.sub(r"\bg\s+rpc\b", "grpc", t) + t = re.sub(r"\bdocker\s+compose\b", "docker compose", t) + return t + + +def normalize_skill(token: str) -> Optional[str]: + t = _normalize_skill_surface(token) + if not t: + return None + + # Avoid false-positive java from "javascript" + if t == "java" and re.search(r"\bjava\s*script\b", _normalize_skill_surface(token)): + return "javascript" + + return _SKILL_ALIAS.get(t, t) + + +def normalize_skills(skills: List[str]) -> List[str]: + out: List[str] = [] + seen = set() + for s in skills or []: + canon = normalize_skill(s) + if not canon or canon in seen: + continue + seen.add(canon) + out.append(canon) + return out + + +def normalize_role(token: str) -> Optional[str]: + t = (token or "").strip().lower() + if not t: + return None + return _ROLE_ALIAS.get(t, t) + + +def normalize_roles(roles: List[str]) -> List[str]: + out: List[str] = [] + seen = set() + for r in roles or []: + canon = normalize_role(r) + if not canon or canon in seen: + continue + seen.add(canon) + out.append(canon) + return out + + +def 
split_skills_primary_secondary(
    skills: List[str],
    *,
    clean_text: str,
    sections: Dict[str, str] | None = None,
    primary_limit: int = 25,
) -> Tuple[List[str], List[str]]:
    if not skills:
        return [], []

    text = (clean_text or "").lower()
    skills_section = (sections or {}).get("skills", "").lower()
    experience_section = (sections or {}).get("experience", "").lower()

    scores: Dict[str, float] = {}
    for sk in skills:
        s = sk.lower()
        score = 1.0
        if s in skills_section:
            score += 2.2
        if s in experience_section:
            score += 1.2
        count = len(re.findall(r"\b" + re.escape(s) + r"\b", text))
        score += min(2.5, count * 0.5)
        if s in _SKILL_STOP:
            score -= 1.5
        scores[sk] = score

    ranked = sorted(skills, key=lambda x: scores.get(x, 0.0), reverse=True)
    primary = [s for s in ranked if scores.get(s, 0.0) >= 2.0][:primary_limit]
    secondary = [s for s in ranked if s not in primary]
    return primary, secondary


def normalize_location(raw: Optional[str]) -> Optional[str]:
    if not raw:
        return None
    t = raw.strip()
    low = t.lower()
    if low in ("москва", "moscow", "moscow, russia"):
        return "Moscow, Russia"
    if low in ("санкт-петербург", "спб", "питер", "saint petersburg"):
        return "Saint Petersburg, Russia"
    return t


def find_skills_in_text(text: str) -> List[str]:
    if not text:
        return []
    found: List[str] = []
    seen = set()
    low = _normalize_skill_surface(text)
    for alias, canon in _SKILL_ALIAS.items():
        key = _normalize_skill_surface(alias)
        if key in seen:
            continue
        if re.search(r"\b" + re.escape(key) + r"\b", low):
            if canon not in seen:
                found.append(canon)
                seen.add(canon)
    return found
diff --git a/pdf_merge.py b/pdf_merge.py
new file mode 100644
index 0000000..b2b31af
--- /dev/null
+++ b/pdf_merge.py
@@ -0,0 +1,45 @@
from __future__ import annotations

from pathlib import Path
from typing import Iterable, List, Optional

from pypdf import PdfReader, PdfWriter


def merge_pdfs(pdf_paths: Iterable[str | Path], out_pdf_path: str | Path) -> dict:
    out_pdf_path = Path(out_pdf_path)
    out_pdf_path.parent.mkdir(parents=True, exist_ok=True)

    writer = PdfWriter()

    merged: List[str] = []
    skipped: List[str] = []

    for p in pdf_paths:
        path = Path(p)
        try:
            reader = PdfReader(str(path))
            # just append the pages one after another, in input order
            for page in reader.pages:
                writer.add_page(page)
            merged.append(str(path))
        except Exception:
            skipped.append(str(path))

    if merged:
        with out_pdf_path.open("wb") as f:
            writer.write(f)

    return {
        "out_pdf": str(out_pdf_path),
        "merged_count": len(merged),
        "skipped_count": len(skipped),
        "merged_files": merged,
        "skipped_files": skipped,
    }


def merge_all_pdfs_in_dir(files_dir: str | Path, out_pdf_path: str | Path) -> dict:
    files_dir = Path(files_dir)
    # both globs are needed on case-sensitive filesystems; on case-insensitive
    # ones the second glob re-yields the same files
    pdfs = sorted(files_dir.rglob("*.pdf")) + sorted(files_dir.rglob("*.PDF"))
    return merge_pdfs(pdfs, out_pdf_path)
diff --git a/pipeline.py b/pipeline.py
new file mode 100644
index 0000000..d7f5b86
--- /dev/null
+++ b/pipeline.py
@@ -0,0 +1,1990 @@
from __future__ import annotations

import json
import os
import re
import shutil
import sqlite3
import subprocess
import uuid
from dataclasses import asdict
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from tg_resume_db.util import Logger, utc_iso
from tg_resume_db.extract.text_extract import extract_text as extract_text_generic
from tg_resume_db.extract.clean import normalize_text, to_fts_text
from tg_resume_db.extract.pdf_extract
import extract_pdf_best +from tg_resume_db.extract.llm import ( + LLMExtraction, + llm_extract_profile, + llm_parse_enabled, + llm_review_profile, +) +from tg_resume_db.extract.doc_type import detect_doc_type +from tg_resume_db.extract.sections import split_sections, sections_present +from tg_resume_db.extract.experience_timeline import extract_positions, positions_to_dicts +from tg_resume_db.extract.parse import ( + extract_contacts as extract_contacts_raw, + extract_name_guess, + extract_remote, + extract_english, + extract_salary, + extract_location_best_effort, + extract_experience_years, # Updated function + norm_pipe, + safe_json, +) +from tg_resume_db.extract.templates import generic as tpl_generic +from tg_resume_db.extract.templates import hh_ru as tpl_hh +from tg_resume_db.extract.templates import linkedin as tpl_linkedin +from tg_resume_db.extract.templates import one_page_en as tpl_one_page_en +from tg_resume_db.extract.templates import one_page_ru as tpl_one_page_ru +from tg_resume_db.extract.templates import pptx_export as tpl_pptx +from tg_resume_db.normalize import ( + normalize_skills, + normalize_roles, + split_skills_primary_secondary, + normalize_location, +) +from tg_resume_db.dedup.simhash import ( + sha256_file, + sha1_str, + simhash64, + simhash_bands, + hamming64, +) +from tg_resume_db.importers.telegram_json import find_result_json, iter_artifacts as iter_json_artifacts +from tg_resume_db.importers.telegram_html import find_messages_html, iter_artifacts as iter_html_artifacts +from tg_resume_db.importers.file_scan import iter_files as iter_file_scan + +_PARSE_VERSION = "v3_llm_review" + + +# ----------------------------- +# helpers: make everything text +# ----------------------------- + +def coerce_text(x: Any) -> str: + """Turn Telegram-export weird structures (dict/list/bytes) into plain text.""" + if x is None: + return "" + if isinstance(x, str): + return x + if isinstance(x, bytes): + for enc in ("utf-8", "utf-16", "cp1251", "latin-1"): + try: + return x.decode(enc, errors="ignore") + except Exception: + pass + return x.decode("utf-8", errors="ignore") + + if isinstance(x, list): + parts: List[str] = [] + for item in x: + if isinstance(item, dict): + parts.append(coerce_text(item.get("text") or item.get("href") or "")) + else: + parts.append(coerce_text(item)) + return "".join(parts) + + if isinstance(x, dict): + if "text" in x: + return coerce_text(x["text"]) + if "content" in x: + return coerce_text(x["content"]) + return json.dumps(x, ensure_ascii=False) + + return str(x) + + +# ----------------------------- +# PDF extraction: prefer pdftotext +# ----------------------------- + +def _which_pdftotext() -> Optional[str]: + if os.environ.get("PDFTOTEXT_ENABLE", "0").lower() not in ("1", "true", "yes"): + return None + exe = shutil.which("pdftotext") or shutil.which("pdftotext.exe") + return exe + + +def extract_text_from_pdf_pdftotext(fp: Path, timeout_sec: int = 25) -> str: + exe = _which_pdftotext() + if not exe: + return "" + cmd = [exe, "-layout", "-nopgbrk", str(fp), "-"] + try: + p = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + timeout=timeout_sec, + check=False, + text=True, + encoding="utf-8", + errors="ignore", + ) + return (p.stdout or "").strip() + except subprocess.TimeoutExpired: + return "" + except Exception: + return "" + + +def extract_text_resilient(fp: Path, log: Optional[Logger] = None, timeout_sec: int = 25) -> str: + ext = fp.suffix.lower() + + if ext == ".pdf": + out = extract_text_from_pdf_pdftotext(fp, 
timeout_sec=timeout_sec)
        if out:
            return out
        try:
            return extract_text_generic(fp) or ""
        except Exception as e:
            if log:
                log.warn("[extract] pdf failed - skipped", {"file": str(fp), "err": repr(e)})
            return ""

    try:
        return extract_text_generic(fp) or ""
    except Exception as e:
        if log:
            log.warn("[extract] file failed - skipped", {"file": str(fp), "err": repr(e)})
        return ""


# -----------------------------
# contacts normalization + phone/tg cleanup
# -----------------------------

_EMAIL_RE = re.compile(r"\b[a-zA-Z0-9._%+\-]{1,64}@[a-zA-Z0-9.\-]{1,253}\.[a-zA-Z]{2,}\b")
_EMAIL_SPLIT_RE = re.compile(
    r"(?P<prefix>[a-z0-9][a-z0-9._%+\-]{1,40})\s+"
    r"(?P<tail>[a-z0-9][a-z0-9._%+\-]{0,40}@[a-z0-9.\-]+\.[a-z]{2,})",
    re.I,
)
# NOTE: the angle-bracketed spans in this block were destroyed by markup
# stripping. The named groups above are certain (the code below calls
# m.group("prefix")/m.group("tail")); the four patterns/sets that follow are a
# best-effort reconstruction from how the rest of the module uses them, not
# the original literal values.
_TG_AT_RE = re.compile(r"(?<![\w@])@([A-Za-z0-9_]{5,32})\b")
_TG_LINK_RE = re.compile(r"(?:https?://)?(?:t|telegram)\.me/([A-Za-z0-9_]{5,32})", re.I)
_PHONE_CHUNK_RE = re.compile(r"(?:\+|\b[78])[\d\s().\-]{8,16}\d")
_TG_STOP = {"gmail", "mail", "yandex", "outlook", "icloud", "telegram", "channel", "username"}
_EMAIL_PREFIX_STOP = {"e", "mail", "email", "contact", "contacts"}


def _norm_email(s: str) -> Optional[str]:
    s = s.strip().lower()
    if _EMAIL_RE.fullmatch(s):
        return s
    return None


def _recover_split_emails(text: str) -> List[str]:
    out: List[str] = []
    for m in _EMAIL_SPLIT_RE.finditer(text or ""):
        prefix = (m.group("prefix") or "").strip().lower().strip(".-_")
        if not prefix or prefix in _EMAIL_PREFIX_STOP:
            continue
        if not re.search(r"[._\-\d]", prefix):
            continue
        tail = (m.group("tail") or "").strip().lower()
        if "@" not in tail:
            continue
        local_tail, domain = tail.split("@", 1)
        local = f"{prefix}{local_tail}"
        if len(local) > 64:
            continue
        cand = f"{local}@{domain}"
        if _EMAIL_RE.fullmatch(cand):
            out.append(cand)
    return out


def _prune_fragment_emails(values: List[str]) -> List[str]:
    uniq = sorted(set(v.lower().strip() for v in values if v and "@" in v))
    out: List[str] = []
    for e in uniq:
        local, domain = e.split("@", 1)
        drop = False
        for other in uniq:
            if other == e:
                continue
            ol, od = other.split("@", 1)
            if od != domain:
                continue
            if len(local) <= 8 and len(ol) > len(local) + 2 and ol.endswith(local) and re.search(r"[._\-]", ol):
                drop = True
                break
        if not drop:
            out.append(e)
    return out


def _looks_like_month_range(digits: str) -> bool:
    if len(digits) == 12:
        try:
            mm1 = int(digits[0:2]); yyyy1 = int(digits[2:6])
            mm2 = int(digits[6:8]); yyyy2 = int(digits[8:12])
            if 1 <= mm1 <= 12 and 1900 <= yyyy1 <= 2100 and 1 <= mm2 <= 12 and 1900 <= yyyy2 <= 2100:
                return True
        except Exception:
            return False
    return False


def _norm_phone(s: str) -> Optional[str]:
    raw = s.strip()
    if not (raw.startswith("+") or raw.startswith("7") or raw.startswith("8")):
        return None

    digits = re.sub(r"\D+", "", raw)
    if len(digits) < 10 or len(digits) > 15:
        return None

    if len(set(digits)) <= 2:
        return None

    if _looks_like_month_range(digits):
        return None

    if len(digits) == 12 and digits.startswith(("2", "3", "4", "5", "6", "7", "8", "9")):
        if digits.count("0") >= 6:
            return None

    return "+" + digits


def _norm_tg_handle(handle: str) -> Optional[str]:
    h = handle.strip().lstrip("@").lower()
    if not (5 <= len(h) <= 32):
        return None
    if not re.fullmatch(r"[a-z0-9_]+", h):
        return None
    if h.isdigit():
        return None
    if h in _TG_STOP:
        return None
    return h


def normalize_contacts(raw: Any, clean_text: str) -> Dict[str, List[str]]:
    out: Dict[str, List[str]] = {"email": [], "phone": [], "tg": [], "github": [], "linkedin": []}

    if isinstance(raw, dict):
        key_map = {
            "emails": "email", "email": "email",
            "phones": "phone", "phone": "phone",
            "telegram": "tg", "tg": "tg",
            "github": "github",
            "linkedin": "linkedin",
        }
        for k, v in raw.items():
            nk = key_map.get(k)
            if not nk:
                continue
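            # Contact values arrive either as a single string or as a list of
            # strings/dicts depending on the upstream extractor; coerce every
            # shape to plain text here, then let the per-type normalizers
            # below filter and canonicalize (e.g. _norm_phone turns
            # "+7 (912) 345-67-89" into "+79123456789").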
vals = [coerce_text(x) for x in v] if isinstance(v, list) else [coerce_text(v)] + out[nk].extend(vals) + + for e in _EMAIL_RE.findall(clean_text): + out["email"].append(e) + for e in _recover_split_emails(clean_text): + out["email"].append(e) + + for chunk in _PHONE_CHUNK_RE.findall(clean_text): + out["phone"].append(chunk) + + for h in _TG_AT_RE.findall(clean_text): + out["tg"].append(h) + for h in _TG_LINK_RE.findall(clean_text): + out["tg"].append(h) + + def uniq(seq: List[str]) -> List[str]: + seen = set() + res = [] + for x in seq: + if x in seen: + continue + seen.add(x) + res.append(x) + return res + + emails: List[str] = [] + for s in out["email"]: + n = _norm_email(s) + if n: + emails.append(n) + + phones: List[str] = [] + for s in out["phone"]: + n = _norm_phone(s) + if n: + phones.append(n) + + tgs: List[str] = [] + for s in out["tg"]: + n = _norm_tg_handle(s) + if n: + tgs.append(n) + + out["email"] = uniq(_prune_fragment_emails(emails)) + out["phone"] = uniq(phones) + out["tg"] = uniq(tgs) + + out["github"] = uniq([coerce_text(x).strip() for x in out["github"] if coerce_text(x).strip()]) + out["linkedin"] = uniq([coerce_text(x).strip() for x in out["linkedin"] if coerce_text(x).strip()]) + + return out + + +# ----------------------------- +# LLM helpers +# ----------------------------- + +_LANGUAGE_CANON = { + "python", + "java", + "kotlin", + "go", + "golang", + "c++", + "cpp", + "c#", + "javascript", + "typescript", + "ruby", + "php", + "swift", + "objective-c", + "scala", + "rust", + "dart", +} + +_LANGUAGE_ALIAS = { + "golang": "go", + "cpp": "c++", + "c plus plus": "c++", + "csharp": "c#", + "c#": "c#", + "js": "javascript", + "ts": "typescript", +} + + +_JAVA_REAL_RE = re.compile(r"\b(java\s*(8|11|17|21)|spring|jvm|maven|gradle|jakarta)\b", re.I) +_JAVASCRIPT_RE = re.compile(r"\b(java\s*script|javascript|js)\b", re.I) + + +def _norm_lang_token(token: str) -> Optional[str]: + raw = (token or "").strip().lower() + if not raw: + return None + norm = _LANGUAGE_ALIAS.get(raw, raw) + if norm in _LANGUAGE_CANON: + # collapse golang -> go, cpp -> c++ + if norm == "golang": + norm = "go" + if norm == "cpp": + norm = "c++" + return norm + return None + + +def _normalize_language_list(values: List[str]) -> List[str]: + seen = set() + out: List[str] = [] + for v in values or []: + tok = _norm_lang_token(v) + if not tok or tok in seen: + continue + seen.add(tok) + out.append(tok) + return out + + +def _drop_false_java( + skills: List[str], + primary_languages: List[str], + clean_text: str, +) -> Tuple[List[str], List[str]]: + norm_skills = [str(s).strip().lower() for s in (skills or [])] + if "java" not in norm_skills: + return skills, primary_languages + + txt = clean_text or "" + has_js = _JAVASCRIPT_RE.search(txt) is not None + has_real_java = _JAVA_REAL_RE.search(txt) is not None + if has_js and not has_real_java: + cleaned_skills = [s for s in skills if str(s).strip().lower() != "java"] + cleaned_langs = [s for s in primary_languages if str(s).strip().lower() != "java"] + return cleaned_skills, cleaned_langs + return skills, primary_languages + + +def _roles_from_desired_title(title: Optional[str]) -> List[str]: + if not title: + return [] + t = title.lower() + out: List[str] = [] + if "backend" in t or "бэкенд" in t or "бекенд" in t: + out.append("backend") + if "frontend" in t or "фронтенд" in t: + out.append("frontend") + if "fullstack" in t or "full stack" in t or "фулстек" in t: + out.append("fullstack") + if "devops" in t or "sre" in t: + out.append("devops") + if "qa" in 
t or "test" in t or "тестировщик" in t: + out.append("qa") + if "data" in t or "ml" in t or "machine learning" in t or "аналитик" in t: + out.append("data") + if "android" in t or "ios" in t or "mobile" in t or "мобиль" in t: + out.append("mobile") + return out + + +def _merge_lists(base: List[str], extra: List[str], limit: Optional[int] = None) -> List[str]: + seen = set() + out: List[str] = [] + for seq in (base or [], extra or []): + for x in seq: + t = str(x).strip() + if not t or t.lower() in seen: + continue + seen.add(t.lower()) + out.append(t) + if limit is not None and len(out) >= limit: + return out + return out + + +def _pick_salary( + heur_min: Optional[int], + heur_max: Optional[int], + heur_conf: Optional[float], + llm_min: Optional[int], + llm_max: Optional[int], +) -> Tuple[Optional[int], Optional[int], Optional[float]]: + if heur_min or heur_max: + if heur_conf is None: + heur_conf = 0.55 + return heur_min, heur_max, heur_conf + + if llm_min or llm_max: + return llm_min, llm_max, 0.65 + + return heur_min, heur_max, heur_conf + + +_EN_SIGNAL_RE = re.compile(r"\b(english|англий|ielts|toefl|cefr|a1|a2|b1|b2|c1|c2)\b", re.I) + + +def _has_english_signal(text: str) -> bool: + if not text: + return False + return _EN_SIGNAL_RE.search(text) is not None + + +def _can_accept_llm_english(clean_text: str, level: Optional[str]) -> bool: + if not level: + return False + # Require explicit language signal in CV to avoid invented C1/C2. + return _has_english_signal(clean_text) + + +_ROLE_EVIDENCE_PATTERNS: Dict[str, re.Pattern] = { + "qa": re.compile(r"\b(qa|quality assurance|tester|test engineer|test automation)\b", re.I), + "devops": re.compile(r"\b(devops|dev ops|sre|platform engineer|infrastructure engineer)\b", re.I), + "mobile": re.compile(r"\b(mobile|android|ios|react native|flutter)\b", re.I), + "data": re.compile(r"\b(data engineer|data scientist|ml engineer|machine learning)\b", re.I), + "architect": re.compile(r"\b(architect|solution architect|software architect)\b", re.I), +} + + +def _prune_roles_by_evidence(roles: List[str], clean_text: str) -> List[str]: + out: List[str] = [] + seen = set() + t = (clean_text or "").lower() + for role in roles or []: + r = str(role).strip().lower() + if not r or r in seen: + continue + seen.add(r) + pat = _ROLE_EVIDENCE_PATTERNS.get(r) + if pat is not None and not pat.search(t): + continue + out.append(r) + return out + + +def _parse_ym(date_iso: Optional[str]) -> Optional[Tuple[int, int]]: + if not date_iso: + return None + m = re.match(r"^\s*(\d{4})-(\d{2})", str(date_iso).strip()) + if not m: + return None + y = int(m.group(1)) + mm = int(m.group(2)) + if not (1900 <= y <= 2100 and 1 <= mm <= 12): + return None + return (y, mm) + + +def _months_between(a: Tuple[int, int], b: Tuple[int, int]) -> int: + return (b[0] - a[0]) * 12 + (b[1] - a[1]) + + +def _experience_years_from_positions(position_dicts: List[Dict[str, Any]]) -> Optional[float]: + intervals: List[Tuple[Tuple[int, int], Tuple[int, int]]] = [] + for p in position_dicts or []: + if not isinstance(p, dict): + continue + a = _parse_ym(p.get("date_from")) + b = _parse_ym(p.get("date_to")) + if not a or not b: + continue + if b < a: + a, b = b, a + intervals.append((a, b)) + + if not intervals: + return None + + intervals.sort(key=lambda x: x[0]) + merged: List[Tuple[Tuple[int, int], Tuple[int, int]]] = [intervals[0]] + for s, e in intervals[1:]: + ls, le = merged[-1] + if s <= le: + if e > le: + merged[-1] = (ls, e) + else: + merged.append((s, e)) + + months = 0 + for s, e in 
merged: + months += max(0, _months_between(s, e)) + years = round(months / 12.0, 2) + if 0.0 <= years <= 60.0: + return years + return None + + +def _reconcile_experience_fields( + *, + exp_years: Optional[float], + exp_years_eng: Optional[float], + exp_conf: Optional[float], + exp_dbg: Dict[str, Any], + positions: List[Dict[str, Any]], +) -> Tuple[Optional[float], Optional[float], Optional[float], Dict[str, Any]]: + dbg = dict(exp_dbg or {}) + source_notes: List[str] = [] + + pos_years = _experience_years_from_positions(positions) + if pos_years is not None: + dbg["positions_years"] = pos_years + + if exp_years is None and pos_years is not None: + exp_years = pos_years + exp_conf = max(float(exp_conf or 0.0), 0.74) + source_notes.append("positions_fallback") + elif exp_years is not None and pos_years is not None and pos_years > (float(exp_years) + 1.0): + method = str(dbg.get("method") or "") + strong_summary = method in ("summary", "header_chunk") and float(exp_conf or 0.0) >= 0.78 + if strong_summary and (pos_years - float(exp_years)) > 1.5: + source_notes.append("positions_reconcile_skip_strong_summary") + else: + exp_years = pos_years + exp_conf = max(float(exp_conf or 0.0), 0.75) + source_notes.append("positions_reconcile_up") + + # Prevent impossible split like total=1.5 while engineering=7.0. + try: + if exp_years is not None and exp_years_eng is not None: + if float(exp_years) < float(exp_years_eng) * 0.7: + exp_years = float(exp_years_eng) + exp_conf = max(float(exp_conf or 0.0), 0.74) + source_notes.append("eng_gt_total_fix") + except Exception: + pass + + is_recruiter = bool(dbg.get("is_recruiter")) + if exp_years_eng is None and exp_years is not None and not is_recruiter: + exp_years_eng = float(exp_years) + source_notes.append("eng_fill_from_total") + + if source_notes: + dbg["reconcile"] = source_notes + return exp_years, exp_years_eng, exp_conf, dbg + + +def _prefer_explicit_summary_experience( + *, + clean_text: str, + exp_years: Optional[float], + exp_years_eng: Optional[float], + exp_conf: Optional[float], + exp_dbg: Dict[str, Any], +) -> Tuple[Optional[float], Optional[float], Optional[float], Dict[str, Any]]: + try: + clean_total, clean_eng, clean_conf, clean_dbg = extract_experience_years(clean_text or "") + except Exception: + return exp_years, exp_years_eng, exp_conf, exp_dbg + + if clean_total is None: + return exp_years, exp_years_eng, exp_conf, exp_dbg + + if exp_years is None: + merged_dbg = dict(exp_dbg or {}) + merged_dbg["clean_exp_method"] = (clean_dbg or {}).get("method") + return clean_total, (clean_eng if clean_eng is not None else exp_years_eng), max(float(exp_conf or 0.0), float(clean_conf or 0.0)), merged_dbg + + parsed_method = str((exp_dbg or {}).get("method") or "") + clean_method = str((clean_dbg or {}).get("method") or "") + if clean_conf is not None and clean_conf >= 0.78 and clean_method in ("summary", "header_chunk"): + try: + if parsed_method.startswith("timeline") and float(clean_total) + 1.5 < float(exp_years): + merged_dbg = dict(exp_dbg or {}) + merged_dbg["clean_exp_method"] = clean_method + merged_dbg["reconcile_clean"] = "prefer_explicit_summary" + return clean_total, (clean_eng if clean_eng is not None else exp_years_eng), max(float(exp_conf or 0.0), float(clean_conf or 0.0)), merged_dbg + except Exception: + pass + + return exp_years, exp_years_eng, exp_conf, exp_dbg + + +def _need_llm_fallback( + *, + roles: List[str], + skills: List[str], + exp_conf: Optional[float], + english: Optional[str], + location: Optional[str], + name: 
Optional[str], + doc_type: Optional[str], +) -> bool: + if doc_type == "scan_pdf": + return False + if not name: + return True + if not roles and len(skills) < 2: + return True + if exp_conf is None or exp_conf < 0.4: + return True + if not english and not location and len(skills) < 2: + return True + return False + + +def _maybe_llm_enrich( + *, + con: sqlite3.Connection, + clean: str, + roles: List[str], + skills: List[str], + exp_conf: Optional[float], + english: Optional[str], + location: Optional[str], + name: Optional[str], + doc_type: Optional[str], + sections: Optional[Dict[str, str]], +) -> Tuple[Optional[LLMExtraction], Dict[str, Any]]: + """ + LLM runs only as fallback when heuristics are weak, + unless forced via LLM_PARSE_FORCE=1. + """ + if not llm_parse_enabled(): + return None, {"enabled": False} + + forced = os.environ.get("LLM_PARSE_FORCE", "0").lower() in ("1", "true", "yes") + if not forced and not _need_llm_fallback( + roles=roles, + skills=skills, + exp_conf=exp_conf, + english=english, + location=location, + name=name, + doc_type=doc_type, + ): + return None, {"enabled": True, "forced": False, "used": False, "reason": "heuristics_ok"} + + llm_res, llm_dbg = llm_extract_profile( + clean, + con=con, + doc_type=doc_type, + sections=sections, + ) + if isinstance(llm_dbg, dict): + llm_dbg["forced"] = forced + llm_dbg["used"] = bool(llm_res) + return llm_res, llm_dbg + + +_EN_ORDER = {"A1": 1, "A2": 2, "B1": 3, "B2": 4, "C1": 5, "C2": 6} + + +def _llm_review_mode() -> str: + mode = (os.environ.get("LLM_PARSE_REVIEW_MODE", "always") or "").strip().lower() + if mode in ("0", "false", "no", "off"): + return "off" + if mode in ("auto", "smart", "on_demand"): + return "auto" + return "always" + + +def _llm_review_rounds() -> int: + raw = (os.environ.get("LLM_PARSE_REVIEW_ROUNDS", "1") or "").strip() + try: + rounds = int(raw) + except Exception: + rounds = 1 + return max(1, min(rounds, 3)) + + +def _normalize_cefr(level: Optional[str]) -> Optional[str]: + if not level: + return None + m = re.search(r"\b(A1|A2|B1|B2|C1|C2)\b", str(level).upper()) + return m.group(1) if m else None + + +def _bounded_float(v: Any, lo: float, hi: float) -> Optional[float]: + try: + x = float(v) + except Exception: + return None + if x < lo or x > hi: + return None + return float(round(x, 2)) + + +def _bounded_int(v: Any, lo: int, hi: int) -> Optional[int]: + try: + x = int(float(v)) + except Exception: + return None + if x < lo or x > hi: + return None + return x + + +def _llm_review_needed( + *, + mode: str, + llm_enriched_used: bool, + name: Optional[str], + roles: List[str], + skills: List[str], + exp_conf: Optional[float], + english: Optional[str], + location: Optional[str], +) -> bool: + if mode == "off": + return False + if mode == "always": + return True + + if llm_enriched_used: + return True + if not name: + return True + if not roles or len(skills) < 3: + return True + if exp_conf is None or exp_conf < 0.65: + return True + if not english or not location: + return True + return False + + +def _build_llm_review_draft( + *, + roles: List[str], + skills: List[str], + primary_languages: List[str], + seniority: Optional[str], + backend_focus: Optional[bool], + exp_years: Optional[float], + exp_years_eng: Optional[float], + english: Optional[str], + location: Optional[str], + remote: Optional[bool], + sal_min: Optional[int], + sal_max: Optional[int], + highlights: List[str], + keywords: List[str], +) -> Dict[str, Any]: + return { + "roles": roles[:12], + "skills": skills[:64], + 
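        # The slice caps mirror the merge limits in _merge_review_result, so
        # the draft handed to the reviewer model stays bounded even when the
        # heuristics produced noisy, oversized lists.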
"primary_languages": primary_languages[:12], + "seniority": seniority, + "backend_focus": backend_focus, + "experience_years_total": exp_years, + "experience_years_engineering": exp_years_eng, + "english_level": _normalize_cefr(english), + "location": location, + "remote_ok": remote, + "salary_min_rub": sal_min, + "salary_max_rub": sal_max, + "highlights": highlights[:8], + "keywords": keywords[:40], + } + + +def _merge_review_result( + *, + review: LLMExtraction, + review_dbg: Dict[str, Any], + roles: List[str], + skills: List[str], + primary_languages: List[str], + seniority: Optional[str], + backend_focus: Optional[bool], + remote: Optional[bool], + location: Optional[str], + english: Optional[str], + exp_years: Optional[float], + exp_years_eng: Optional[float], + exp_conf: Optional[float], + sal_min: Optional[int], + sal_max: Optional[int], + sal_conf: Optional[float], + highlights: List[str], + keywords: List[str], + llm_summary: Optional[str], + llm_tags: List[str], +) -> Tuple[Dict[str, Any], Dict[str, Any]]: + quality = review_dbg.get("quality_score") + try: + quality_f = float(quality) if quality is not None else None + except Exception: + quality_f = None + trusted = quality_f is None or quality_f >= 0.55 + + changed: List[str] = [] + model_changed_raw = review_dbg.get("model_changed_fields") or [] + model_changed = set() + if isinstance(model_changed_raw, list): + for x in model_changed_raw: + s = str(x).strip() + if s: + model_changed.add(s) + + roles_out = list(roles or []) + if review.roles: + if trusted and "roles" in model_changed: + merged_roles = _merge_lists(review.roles, [], limit=12) + else: + merged_roles = _merge_lists(review.roles, roles_out, limit=12) if trusted else _merge_lists(roles_out, review.roles, limit=12) + if merged_roles != roles_out: + changed.append("roles") + roles_out = merged_roles + + skills_out = list(skills or []) + if review.skills: + merged_skills = _merge_lists(review.skills, skills_out, limit=64) if trusted else _merge_lists(skills_out, review.skills, limit=64) + if merged_skills != skills_out: + changed.append("skills") + skills_out = merged_skills + + langs_out = list(primary_languages or []) + review_langs = _normalize_language_list(review.primary_languages) + if review_langs: + if trusted and "primary_languages" in model_changed: + merged_langs = _merge_lists(review_langs, [], limit=12) + else: + merged_langs = _merge_lists(review_langs, langs_out, limit=12) if trusted else _merge_lists(langs_out, review_langs, limit=12) + if merged_langs != langs_out: + changed.append("primary_languages") + langs_out = merged_langs + + seniority_out = seniority + if review.seniority and (trusted or not seniority_out): + if review.seniority != seniority_out: + changed.append("seniority") + seniority_out = review.seniority + + backend_focus_out = backend_focus + if review.backend_focus is not None and (trusted or backend_focus_out is None): + if review.backend_focus != backend_focus_out: + changed.append("backend_focus") + backend_focus_out = review.backend_focus + + remote_out = remote + if review.remote_ok is not None and (trusted or remote_out is None): + if review.remote_ok != remote_out: + changed.append("remote") + remote_out = review.remote_ok + + location_out = location + if review.location and (trusted or not location_out): + loc = review.location.strip() + if 2 <= len(loc) <= 120 and loc != (location_out or ""): + changed.append("location") + location_out = loc + + english_out = _normalize_cefr(english) + review_english = 
_normalize_cefr(review.english_level) + if review_english: + if english_out is None: + english_out = review_english + changed.append("english") + elif trusted and _EN_ORDER.get(review_english, 0) > _EN_ORDER.get(english_out, 0): + english_out = review_english + changed.append("english") + + exp_years_out = exp_years + exp_years_eng_out = exp_years_eng + exp_conf_out = exp_conf + review_exp_total = _bounded_float(review.experience_years_total, 0.0, 60.0) + review_exp_eng = _bounded_float(review.experience_years_engineering, 0.0, 60.0) + if review_exp_total is not None: + if exp_years_out is None or (trusted and ((exp_conf_out or 0.0) < 0.75)): + if exp_years_out != review_exp_total: + changed.append("experience_years_total") + exp_years_out = review_exp_total + exp_conf_out = max(float(exp_conf_out or 0.0), 0.78 if trusted else 0.65) + if review_exp_eng is not None: + if exp_years_eng_out is None or trusted: + if exp_years_eng_out != review_exp_eng: + changed.append("experience_years_engineering") + exp_years_eng_out = review_exp_eng + exp_conf_out = max(float(exp_conf_out or 0.0), 0.74 if trusted else 0.62) + + sal_min_out = sal_min + sal_max_out = sal_max + sal_conf_out = sal_conf + cand_min = _bounded_int(review.salary_min_rub, 10_000, 200_000_000) + cand_max = _bounded_int(review.salary_max_rub, 10_000, 200_000_000) + if cand_min is None and cand_max is None: + cand_min = _bounded_int(review.salary_min_usd, 100, 2_000_000) + cand_max = _bounded_int(review.salary_max_usd, 100, 2_000_000) + if cand_min is not None or cand_max is not None: + if cand_min is not None and cand_max is not None and cand_min > cand_max: + cand_min, cand_max = cand_max, cand_min + if (sal_min_out is None and sal_max_out is None) or (trusted and (sal_conf_out is None or sal_conf_out < 0.75)): + if cand_min is not None and cand_min != sal_min_out: + sal_min_out = cand_min + changed.append("salary") + if cand_max is not None and cand_max != sal_max_out: + sal_max_out = cand_max + changed.append("salary") + sal_conf_out = max(float(sal_conf_out or 0.0), 0.72 if trusted else 0.60) + + highlights_out = list(highlights or []) + if review.highlights: + merged_highlights = _merge_lists(review.highlights, highlights_out, limit=8) if trusted else _merge_lists(highlights_out, review.highlights, limit=8) + if merged_highlights != highlights_out: + highlights_out = merged_highlights + changed.append("highlights") + + keywords_out = list(keywords or []) + if review.keywords: + merged_keywords = _merge_lists(review.keywords, keywords_out, limit=40) if trusted else _merge_lists(keywords_out, review.keywords, limit=40) + if merged_keywords != keywords_out: + keywords_out = merged_keywords + changed.append("keywords") + + llm_tags_out = list(llm_tags or []) + llm_tags_out = _merge_lists(keywords_out, llm_tags_out, limit=40) + llm_tags_out = _merge_lists(skills_out, llm_tags_out, limit=40) + llm_tags_out = _merge_lists(langs_out, llm_tags_out, limit=40) + + llm_summary_out = llm_summary + if highlights_out: + merged_summary = "; ".join([h.strip() for h in highlights_out if h.strip()])[:800] + if merged_summary and merged_summary != (llm_summary_out or ""): + llm_summary_out = merged_summary + changed.append("llm_summary") + + changed_uniq = [] + changed_seen = set() + for item in changed: + if item in changed_seen: + continue + changed_seen.add(item) + changed_uniq.append(item) + + return ( + { + "roles": roles_out, + "skills": skills_out, + "primary_languages": langs_out, + "seniority": seniority_out, + "backend_focus": 
backend_focus_out, + "remote": remote_out, + "location": location_out, + "english": english_out, + "exp_years": exp_years_out, + "exp_years_eng": exp_years_eng_out, + "exp_conf": exp_conf_out, + "sal_min": sal_min_out, + "sal_max": sal_max_out, + "sal_conf": sal_conf_out, + "highlights": highlights_out, + "keywords": keywords_out, + "llm_summary": llm_summary_out, + "llm_tags": llm_tags_out, + }, + { + "trusted": trusted, + "quality_score": quality_f, + "changed_fields": changed_uniq, + "issues_found": review_dbg.get("issues_found") or [], + "model_changed_fields": review_dbg.get("changed_fields") or [], + }, + ) + + +# ----------------------------- +# candidate/resume DB helpers +# ----------------------------- + +def stable_candidate_id(contacts: Dict[str, List[str]], name: Optional[str], simh: int) -> str: + if contacts.get("email"): + return "cand_" + sha1_str("email:" + contacts["email"][0]) + if contacts.get("phone"): + return "cand_" + sha1_str("phone:" + contacts["phone"][0]) + if contacts.get("tg"): + return "cand_" + sha1_str("tg:" + contacts["tg"][0]) + if contacts.get("github"): + return "cand_" + sha1_str("gh:" + contacts["github"][0]) + if contacts.get("linkedin"): + return "cand_" + sha1_str("li:" + contacts["linkedin"][0]) + base = (name or "unknown").strip().lower() + return "cand_" + sha1_str(f"name:{base}:{simh}") + + +def _candidate_by_contact(con: sqlite3.Connection, contacts: Dict[str, List[str]]) -> Optional[str]: + checks = [ + ("email", contacts.get("email", [])), + ("phone", contacts.get("phone", [])), + ("tg", contacts.get("tg", [])), + ("github", contacts.get("github", [])), + ("linkedin", contacts.get("linkedin", [])), + ] + for ctype, vals in checks: + for v in vals: + row = con.execute( + "SELECT candidate_id FROM candidate_contacts WHERE contact_type=? 
AND contact_value=?", + (ctype, v), + ).fetchone() + if row: + return row["candidate_id"] + return None + + +def _upsert_contacts(con: sqlite3.Connection, candidate_id: str, contacts: Dict[str, List[str]]) -> None: + pairs: List[Tuple[str, str]] = [] + for e in contacts.get("email", []): + pairs.append(("email", e)) + for p in contacts.get("phone", []): + pairs.append(("phone", p)) + for t in contacts.get("tg", []): + pairs.append(("tg", t)) + for g in contacts.get("github", []): + pairs.append(("github", g)) + for l in contacts.get("linkedin", []): + pairs.append(("linkedin", l)) + + for ctype, val in pairs: + con.execute( + "INSERT OR IGNORE INTO candidate_contacts(contact_type, contact_value, candidate_id) VALUES (?,?,?)", + (ctype, val, candidate_id), + ) + + +def _upsert_candidate_skills( + con: sqlite3.Connection, + candidate_id: str, + skills_primary: List[str], + skills_secondary: List[str], + source: str, +) -> None: + for sk in skills_primary: + con.execute( + """INSERT OR REPLACE INTO candidate_skills(candidate_id, skill_id, skill_label, confidence, source, evidence) + VALUES (?,?,?,?,?,?)""", + (candidate_id, sk, sk, 0.90, source, "skills_primary"), + ) + for sk in skills_secondary: + con.execute( + """INSERT OR REPLACE INTO candidate_skills(candidate_id, skill_id, skill_label, confidence, source, evidence) + VALUES (?,?,?,?,?,?)""", + (candidate_id, sk, sk, 0.60, source, "skills_secondary"), + ) + + +def _upsert_candidate_roles( + con: sqlite3.Connection, + candidate_id: str, + roles: List[str], + source: str, +) -> None: + for r in roles: + con.execute( + """INSERT OR REPLACE INTO candidate_roles(candidate_id, role, confidence, source, evidence) + VALUES (?,?,?,?,?)""", + (candidate_id, r, 0.80, source, "roles"), + ) + + +def _upsert_candidate_languages( + con: sqlite3.Connection, + candidate_id: str, + english_level: Optional[str], + source: str, +) -> None: + if not english_level: + return + con.execute( + """INSERT OR REPLACE INTO candidate_languages(candidate_id, language, level, confidence, source, evidence) + VALUES (?,?,?,?,?,?)""", + (candidate_id, "english", english_level, 0.75, source, "english_level"), + ) + + +def _ensure_candidate(con: sqlite3.Connection, candidate_id: str, fields: Dict[str, Any]) -> None: + # Attempt to ensure the new column exists if migration didn't run + try: + con.execute("ALTER TABLE candidates ADD COLUMN experience_years_eng REAL") + except Exception: + pass # Column likely exists or basic sqlite error, proceed to insert + try: + con.execute("ALTER TABLE candidates ADD COLUMN primary_languages_json TEXT") + except Exception: + pass + try: + con.execute("ALTER TABLE candidates ADD COLUMN backend_focus INTEGER") + except Exception: + pass + + exists = con.execute("SELECT 1 FROM candidates WHERE candidate_id=?", (candidate_id,)).fetchone() is not None + + primary_languages_json = safe_json(fields.get("primary_languages", [])) + backend_focus_field = fields.get("backend_focus") + backend_focus_int = None if backend_focus_field is None else (1 if backend_focus_field else 0) + + if not exists: + con.execute( + """INSERT INTO candidates( + candidate_id, name, location, remote, + experience_years, experience_years_eng, experience_confidence, + salary_min, salary_max, salary_confidence, + english_level, roles_json, skills_json, primary_languages_json, + roles_norm, skills_norm, backend_focus, + created_at, updated_at + ) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""", + ( + candidate_id, + fields.get("name"), + fields.get("location"), + 
fields.get("remote"), + fields.get("experience_years"), + fields.get("experience_years_eng"), # new field + fields.get("experience_confidence"), + fields.get("salary_min"), + fields.get("salary_max"), + fields.get("salary_confidence"), + fields.get("english_level"), + safe_json(fields.get("roles", [])), + safe_json(fields.get("skills", [])), + primary_languages_json, + fields.get("roles_norm") or "|", + fields.get("skills_norm") or "|", + backend_focus_int, + utc_iso(), + utc_iso(), + ), + ) + else: + con.execute( + """UPDATE candidates SET + name = COALESCE(?, name), + location = COALESCE(?, location), + remote = COALESCE(?, remote), + + experience_years = COALESCE(?, experience_years), + experience_years_eng = COALESCE(?, experience_years_eng), + experience_confidence = COALESCE(?, experience_confidence), + + salary_min = COALESCE(?, salary_min), + salary_max = COALESCE(?, salary_max), + salary_confidence = COALESCE(?, salary_confidence), + + english_level = COALESCE(?, english_level), + + roles_json = CASE WHEN ? IS NOT NULL AND ? != '[]' THEN ? ELSE roles_json END, + skills_json = CASE WHEN ? IS NOT NULL AND ? != '[]' THEN ? ELSE skills_json END, + primary_languages_json = CASE WHEN ? IS NOT NULL AND ? != '[]' THEN ? ELSE primary_languages_json END, + roles_norm = CASE WHEN ? != '|' THEN ? ELSE roles_norm END, + skills_norm = CASE WHEN ? != '|' THEN ? ELSE skills_norm END, + backend_focus = COALESCE(?, backend_focus), + + updated_at = ? + WHERE candidate_id = ?""", + ( + fields.get("name"), + fields.get("location"), + fields.get("remote"), + fields.get("experience_years"), + fields.get("experience_years_eng"), # new field update + fields.get("experience_confidence"), + fields.get("salary_min"), + fields.get("salary_max"), + fields.get("salary_confidence"), + fields.get("english_level"), + + safe_json(fields.get("roles", [])), + safe_json(fields.get("roles", [])), + safe_json(fields.get("roles", [])), + + safe_json(fields.get("skills", [])), + safe_json(fields.get("skills", [])), + safe_json(fields.get("skills", [])), + + primary_languages_json, + primary_languages_json, + primary_languages_json, + + fields.get("roles_norm") or "|", + fields.get("roles_norm") or "|", + + fields.get("skills_norm") or "|", + fields.get("skills_norm") or "|", + + backend_focus_int, + utc_iso(), + candidate_id, + ), + ) + + +def _resume_by_sha(con: sqlite3.Connection, sha: str) -> Optional[str]: + row = con.execute("SELECT resume_id FROM resumes WHERE sha256=?", (sha,)).fetchone() + return row["resume_id"] if row else None + + +def _near_duplicate_active_resume(con: sqlite3.Connection, simh: int, max_dist: int) -> Optional[Tuple[str, int]]: + candidate_resume_ids = set() + for bucket, band in simhash_bands(simh): + cur = con.execute("SELECT resume_id FROM simhash_buckets WHERE bucket=? AND band=?", (bucket, band)) + for r in cur.fetchall(): + candidate_resume_ids.add(r["resume_id"]) + + best: Optional[Tuple[str, int]] = None + for rid in candidate_resume_ids: + row = con.execute("SELECT simhash FROM resumes WHERE resume_id=? 
AND is_active=1", (rid,)).fetchone() + if not row or row["simhash"] is None: + continue + try: + old = int(str(row["simhash"]), 16) + except Exception: + continue + dist = hamming64(old, simh) + if dist <= max_dist: + if best is None or dist < best[1]: + best = (rid, dist) + return best + + +def _insert_resume( + con: sqlite3.Connection, + candidate_id: str, + sha: Optional[str], + simh: int, + clean_text: str, + raw_text: str, + extraction_json: str, + llm_summary: Optional[str], + llm_tags: List[str], + extract_method: Optional[str], + extract_quality_score: Optional[float], + extract_quality_flags: Optional[str], + extract_pages_json: Optional[str], + doc_type: Optional[str], + doc_type_confidence: Optional[float], + parse_method: Optional[str], + parse_version: Optional[str], + sections_json: Optional[str], + file_path: Optional[str], + mtime: Optional[int], + size: Optional[int], + near_dup_of: Optional[str], +) -> str: + resume_id = "res_" + uuid.uuid4().hex + + if near_dup_of: + con.execute("UPDATE resumes SET is_active=0 WHERE resume_id=?", (near_dup_of,)) + + con.execute( + """INSERT INTO resumes( + resume_id, candidate_id, sha256, simhash, clean_text, raw_text, extraction_json, + llm_summary, llm_tags_json, + extract_method, extract_quality_score, extract_quality_flags, extract_pages_json, + doc_type, doc_type_confidence, parse_method, parse_version, sections_json, + is_active, duplicate_of_resume_id, file_path, file_mtime, file_size, created_at + ) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""", + ( + resume_id, + candidate_id, + sha, + f"{simh:016x}", + clean_text, + raw_text[:250000], + extraction_json, + llm_summary, + safe_json(llm_tags), + extract_method, + extract_quality_score, + extract_quality_flags, + extract_pages_json, + doc_type, + doc_type_confidence, + parse_method, + parse_version, + sections_json, + 1, + near_dup_of, + file_path, + mtime, + size, + utc_iso(), + ), + ) + + for bucket, band in simhash_bands(simh): + con.execute( + "INSERT OR IGNORE INTO simhash_buckets(bucket, band, resume_id) VALUES (?,?,?)", + (bucket, band, resume_id), + ) + + return resume_id + + +def _insert_source(con: sqlite3.Connection, resume_id: str, src: Dict[str, Any]) -> None: + con.execute( + """INSERT INTO sources( + resume_id, export_path, chat_title, message_id, message_date, + origin_type, original_file_path, original_file_name, extra_json + ) VALUES (?,?,?,?,?,?,?,?,?)""", + ( + resume_id, + src.get("export_path"), + src.get("chat_title"), + src.get("message_id"), + src.get("message_date"), + src.get("origin_type"), + src.get("file_path"), + src.get("original_name"), + json.dumps(src.get("extra", {}), ensure_ascii=False), + ), + ) + + +def _insert_positions( + con: sqlite3.Connection, + resume_id: str, + candidate_id: str, + positions: List[Dict[str, Any]], +) -> None: + if not positions: + return + for p in positions: + pos_id = "pos_" + uuid.uuid4().hex + con.execute( + """INSERT INTO positions( + position_id, resume_id, candidate_id, title, company, + date_from, date_to, is_current, description, stack_json + ) VALUES (?,?,?,?,?,?,?,?,?,?)""", + ( + pos_id, + resume_id, + candidate_id, + p.get("title"), + p.get("company"), + p.get("date_from"), + p.get("date_to"), + 1 if p.get("is_current") else 0 if p.get("is_current") is not None else None, + p.get("description"), + json.dumps(p.get("stack") or [], ensure_ascii=False), + ), + ) + + +def _update_files_seen(con: sqlite3.Connection, sha: str, size: int, mtime: int, canonical_resume_id: str) -> None: + con.execute( 
+ """INSERT INTO files_seen(sha256, size, mtime, canonical_resume_id, first_seen_at, last_seen_at) + VALUES (?,?,?,?,?,?) + ON CONFLICT(sha256) DO UPDATE SET + size=excluded.size, + mtime=excluded.mtime, + canonical_resume_id=excluded.canonical_resume_id, + last_seen_at=excluded.last_seen_at + """, + (sha, size, mtime, canonical_resume_id, utc_iso(), utc_iso()), + ) + + +# ----------------------------- +# artifacts collection +# ----------------------------- + +def collect_artifacts(input_root: Path) -> List[Dict[str, Any]]: + artifacts: List[Dict[str, Any]] = [] + + for rj in find_result_json(input_root): + artifacts.extend(list(iter_json_artifacts(rj))) + + for mh in find_messages_html(input_root): + artifacts.extend(list(iter_html_artifacts(mh))) + + artifacts.extend(list(iter_file_scan(input_root))) + return artifacts + + +# ----------------------------- +# main pipeline +# ----------------------------- + +def import_exports( + con: sqlite3.Connection, + input_dir: str, + log: Logger, + max_near_dist: int = 6, + min_text_len: int = 250, + commit_every: int = 20, +) -> Dict[str, Any]: + root = Path(input_dir).resolve() + if not root.exists(): + raise SystemExit(f"Input not found: {root}") + + artifacts = collect_artifacts(root) + log.info(f"[import] artifacts found: {len(artifacts)}", {"input": str(root)}) + + stats: Dict[str, Any] = { + "input": str(root), + "artifacts": len(artifacts), + "processed_new": 0, + "dup_sha": 0, + "near_dup": 0, + "short_or_empty": 0, + "errors": 0, + "sources_added_only": 0, + "llm_enriched": 0, + "llm_reviewed": 0, + "llm_review_changed": 0, + } + near_dup_examples: List[Dict[str, Any]] = [] + + for i, a in enumerate(artifacts, start=1): + try: + raw_text = "" + sha = None + + file_path = a.get("file_path") + size = None + mtime = None + + extract_method = None + extract_score = None + extract_flags: List[str] = [] + pages: List[Dict[str, Any]] = [] + + if file_path: + fp = Path(file_path) + if not fp.exists(): + continue + st = fp.stat() + size = int(st.st_size) + mtime = int(st.st_mtime) + + sha = sha256_file(str(fp)) + + existing_resume = _resume_by_sha(con, sha) + if existing_resume: + stats["dup_sha"] += 1 + _insert_source(con, existing_resume, a) + _update_files_seen(con, sha, size, mtime, existing_resume) + stats["sources_added_only"] += 1 + continue + + if fp.suffix.lower() == ".pdf": + pdf_res = extract_pdf_best(fp, timeout_sec=25) + raw_text = pdf_res.text + extract_method = pdf_res.method + extract_score = pdf_res.score + extract_flags = pdf_res.flags + pages = pdf_res.pages + else: + try: + raw_text = extract_text_generic(fp) or "" + except Exception as e: + if log: + log.warn("[extract] file failed - skipped", {"file": str(fp), "err": repr(e)}) + raw_text = "" + extract_method = f"file_{fp.suffix.lower().lstrip('.') or 'unknown'}" + else: + raw_text = a.get("message_text") or "" + extract_method = "telegram_post" + + raw_text = coerce_text(raw_text) + + if not raw_text or len(raw_text.strip()) < min_text_len: + stats["short_or_empty"] += 1 + continue + + clean = normalize_text(raw_text) + if not clean or len(clean) < min_text_len: + stats["short_or_empty"] += 1 + continue + + file_ext = Path(file_path).suffix.lower() if file_path else None + dt = detect_doc_type(clean, file_ext=file_ext) + if a.get("origin_type") == "message_text": + from tg_resume_db.extract.doc_type import DocTypeResult + dt = DocTypeResult(doc_type="telegram_post", confidence=0.92, signals=["telegram_message"]) + if extract_flags and "scan_like" in extract_flags: + from 
tg_resume_db.extract.doc_type import DocTypeResult + dt = DocTypeResult(doc_type="scan_pdf", confidence=0.9, signals=dt.signals + ["scan_like"]) + sections = split_sections(clean, dt.doc_type) + sections_list = sections_present(sections) + exp_section_text = sections.get("experience") if isinstance(sections, dict) else None + positions = extract_positions(exp_section_text or clean) + position_dicts = positions_to_dicts(positions) + + parser = tpl_generic + if dt.confidence >= 0.8: + if dt.doc_type == "hh_ru": + parser = tpl_hh + elif dt.doc_type == "linkedin_pdf": + parser = tpl_linkedin + elif dt.doc_type == "one_page_en": + parser = tpl_one_page_en + elif dt.doc_type == "one_page_ru": + parser = tpl_one_page_ru + elif dt.doc_type == "pptx_export": + parser = tpl_pptx + + parsed = parser.parse_resume(clean, sections) + parse_method = parsed.get("parse_method") or "generic_heur" + + contacts_raw = parsed.get("contacts_raw") or extract_contacts_raw(clean) + contacts = normalize_contacts(contacts_raw, clean) + + name = parsed.get("name") or extract_name_guess(clean) + remote = parsed.get("remote") + if remote is None: + remote = extract_remote(clean) + english = parsed.get("english") or extract_english(clean) + + roles = parsed.get("roles") or [] + skills = parsed.get("skills") or [] + primary_languages: List[str] = [] + + location = parsed.get("location") or extract_location_best_effort(clean) + + exp_years = parsed.get("exp_years") + exp_years_eng = parsed.get("exp_years_eng") + exp_conf = parsed.get("exp_conf") + exp_dbg = parsed.get("exp_dbg") or {} + if exp_years is None and exp_years_eng is None: + exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(clean) + exp_years, exp_years_eng, exp_conf, exp_dbg = _prefer_explicit_summary_experience( + clean_text=clean, + exp_years=exp_years, + exp_years_eng=exp_years_eng, + exp_conf=exp_conf, + exp_dbg=exp_dbg, + ) + exp_years, exp_years_eng, exp_conf, exp_dbg = _reconcile_experience_fields( + exp_years=exp_years, + exp_years_eng=exp_years_eng, + exp_conf=exp_conf, + exp_dbg=exp_dbg, + positions=position_dicts, + ) + + sal_min = parsed.get("salary_min") + sal_max = parsed.get("salary_max") + sal_conf = parsed.get("salary_conf") + sal_dbg = parsed.get("salary_dbg") or {} + if sal_min is None and sal_max is None: + sal_min, sal_max, sal_conf, sal_dbg = extract_salary(clean) + + llm_summary: Optional[str] = None + llm_tags: List[str] = [] + seniority: Optional[str] = None + highlights: List[str] = [] + keywords: List[str] = [] + + llm_enriched, llm_dbg = _maybe_llm_enrich( + con=con, + clean=clean, + roles=roles, + skills=skills, + exp_conf=exp_conf, + english=english, + location=location, + name=name, + doc_type=dt.doc_type, + sections=sections, + ) + + backend_focus_flag: Optional[bool] = None + + if llm_enriched: + parse_method = "llm_rag" + stats["llm_enriched"] += 1 + roles = _merge_lists(llm_enriched.roles, roles, limit=8) + + normalized_llm_langs = _normalize_language_list(llm_enriched.primary_languages) + if normalized_llm_langs: + primary_languages = _merge_lists(normalized_llm_langs, primary_languages, limit=8) + skills = _merge_lists(normalized_llm_langs, skills, limit=48) + skills = _merge_lists(llm_enriched.skills, skills, limit=48) + + if remote is None and llm_enriched.remote_ok is not None: + remote = llm_enriched.remote_ok + if not location and llm_enriched.location: + location = llm_enriched.location + if not english and llm_enriched.english_level and _can_accept_llm_english(clean, llm_enriched.english_level): + 
english = llm_enriched.english_level + + backend_focus_flag = llm_enriched.backend_focus + if llm_enriched.backend_focus is True: + roles = _merge_lists(["backend"], roles, limit=8) + elif llm_enriched.backend_focus is False: + pruned_roles: List[str] = [] + seen_roles = set() + for r in roles: + if r.lower() == "backend": + continue + rl = r.lower() + if rl in seen_roles: + continue + seen_roles.add(rl) + pruned_roles.append(r) + roles = pruned_roles + + if (exp_conf is None or exp_conf < 0.6) and llm_enriched.experience_years_total is not None: + exp_years = llm_enriched.experience_years_total + exp_conf = 0.65 + if llm_enriched.experience_years_engineering is not None: + exp_years_eng = llm_enriched.experience_years_engineering + + sal_min, sal_max, sal_conf = _pick_salary( + sal_min, sal_max, sal_conf, llm_enriched.salary_min_rub, llm_enriched.salary_max_rub + ) + if sal_min is None and sal_max is None: + sal_min, sal_max, sal_conf = _pick_salary( + sal_min, sal_max, sal_conf, llm_enriched.salary_min_usd, llm_enriched.salary_max_usd + ) + + seniority = llm_enriched.seniority + highlights = [h.strip() for h in llm_enriched.highlights if h.strip()] + if highlights: + llm_summary = "; ".join(highlights)[:800] + keywords = _merge_lists(llm_enriched.keywords, keywords, limit=40) + llm_tags = _merge_lists(llm_enriched.keywords, llm_tags, limit=24) + llm_tags = _merge_lists(llm_enriched.skills, llm_tags, limit=24) + llm_tags = _merge_lists(llm_enriched.primary_languages, llm_tags, limit=24) + + desired_title = parsed.get("desired_title") + if desired_title: + roles = _merge_lists(_roles_from_desired_title(desired_title), roles, limit=8) + + llm_review_mode = _llm_review_mode() + llm_review_rounds_dbg: List[Dict[str, Any]] = [] + llm_review_merge_dbg: List[Dict[str, Any]] = [] + llm_review_used = False + llm_review_changed = False + + if llm_parse_enabled() and _llm_review_needed( + mode=llm_review_mode, + llm_enriched_used=bool(llm_enriched), + name=name, + roles=roles, + skills=skills, + exp_conf=exp_conf, + english=english, + location=location, + ): + for _ in range(_llm_review_rounds()): + review_draft = _build_llm_review_draft( + roles=roles, + skills=skills, + primary_languages=primary_languages, + seniority=seniority, + backend_focus=backend_focus_flag, + exp_years=exp_years, + exp_years_eng=exp_years_eng, + english=english, + location=location, + remote=remote, + sal_min=sal_min, + sal_max=sal_max, + highlights=highlights, + keywords=keywords, + ) + review_res, review_dbg = llm_review_profile( + clean, + draft=review_draft, + con=con, + doc_type=dt.doc_type, + sections=sections, + ) + llm_review_rounds_dbg.append(review_dbg) + if not review_res: + continue + + llm_review_used = True + merged, merge_dbg = _merge_review_result( + review=review_res, + review_dbg=review_dbg, + roles=roles, + skills=skills, + primary_languages=primary_languages, + seniority=seniority, + backend_focus=backend_focus_flag, + remote=remote, + location=location, + english=english, + exp_years=exp_years, + exp_years_eng=exp_years_eng, + exp_conf=exp_conf, + sal_min=sal_min, + sal_max=sal_max, + sal_conf=sal_conf, + highlights=highlights, + keywords=keywords, + llm_summary=llm_summary, + llm_tags=llm_tags, + ) + llm_review_merge_dbg.append(merge_dbg) + + roles = merged["roles"] + skills = merged["skills"] + primary_languages = merged["primary_languages"] + seniority = merged["seniority"] + backend_focus_flag = merged["backend_focus"] + remote = merged["remote"] + location = merged["location"] + english = 
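+
+                    # Adopt the reviewer's merged profile; the loop runs again only
+                    # while the review keeps changing fields (see the break below).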
merged["english"] + exp_years = merged["exp_years"] + exp_years_eng = merged["exp_years_eng"] + exp_conf = merged["exp_conf"] + sal_min = merged["sal_min"] + sal_max = merged["sal_max"] + sal_conf = merged["sal_conf"] + highlights = merged["highlights"] + keywords = merged["keywords"] + llm_summary = merged["llm_summary"] + llm_tags = merged["llm_tags"] + + if merge_dbg.get("changed_fields"): + llm_review_changed = True + else: + break + + if llm_review_used: + stats["llm_reviewed"] += 1 + if llm_review_changed: + stats["llm_review_changed"] += 1 + if "+llm_review" not in parse_method: + parse_method = f"{parse_method}+llm_review" + + llm_review_meta = { + "enabled": llm_parse_enabled(), + "mode": llm_review_mode, + "used": llm_review_used, + "changed": llm_review_changed, + "rounds": llm_review_rounds_dbg, + "merge": llm_review_merge_dbg, + } + + roles = normalize_roles(roles) + roles = _prune_roles_by_evidence(roles, clean) + skills = normalize_skills(skills) + + skills_primary, skills_secondary = split_skills_primary_secondary( + skills, + clean_text=clean, + sections=sections, + ) + + location = normalize_location(location) + + exp_years, exp_years_eng, exp_conf, exp_dbg = _reconcile_experience_fields( + exp_years=exp_years, + exp_years_eng=exp_years_eng, + exp_conf=exp_conf, + exp_dbg=exp_dbg, + positions=position_dicts, + ) + + if not primary_languages: + language_from_skills = [] + for sk in skills: + tok = _norm_lang_token(sk) + if tok: + language_from_skills.append(tok) + primary_languages = _merge_lists(language_from_skills, primary_languages, limit=8) + + skills, primary_languages = _drop_false_java(skills, primary_languages, clean) + + simh = simhash64(to_fts_text(clean)) + + candidate_id = _candidate_by_contact(con, contacts) or stable_candidate_id(contacts, name, simh) + + _ensure_candidate(con, candidate_id, { + "name": name, + "location": location, + "remote": (1 if remote else 0) if remote is not None else None, + "experience_years": exp_years, + "experience_years_eng": exp_years_eng, # Passed to DB + "experience_confidence": exp_conf if exp_years is not None else None, + "salary_min": sal_min, + "salary_max": sal_max, + "salary_confidence": sal_conf if sal_min is not None else None, + "english_level": english, + "roles": roles, + "skills": skills, + "primary_languages": primary_languages, + "backend_focus": backend_focus_flag, + "roles_norm": norm_pipe(roles), + "skills_norm": norm_pipe(skills), + }) + _upsert_contacts(con, candidate_id, contacts) + _upsert_candidate_skills(con, candidate_id, skills_primary, skills_secondary, parse_method) + _upsert_candidate_roles(con, candidate_id, roles, parse_method) + _upsert_candidate_languages(con, candidate_id, english, parse_method) + + near = _near_duplicate_active_resume(con, simh, max_dist=max_near_dist) + near_dup_of = near[0] if near else None + if near_dup_of: + stats["near_dup"] += 1 + if len(near_dup_examples) < 10: + near_dup_examples.append({ + "new_file": file_path, + "dup_of": near_dup_of, + "dist": near[1], + "candidate_id": candidate_id, + }) + + extraction = { + "name_guess": name, + "contacts": contacts, + "doc_type": { + "type": dt.doc_type, + "confidence": dt.confidence, + "signals": dt.signals, + }, + "extract": { + "method": extract_method, + "quality_score": extract_score, + "quality_flags": extract_flags, + "pages": pages[:40], + }, + "sections_present": sections_list, + "parse": { + "method": parse_method, + "version": _PARSE_VERSION, + }, + "desired_title": desired_title, + "skills_primary": 
skills_primary, + "skills_secondary": skills_secondary, + "hh_meta": { + "specializations": parsed.get("specializations"), + "employment_type": parsed.get("employment_type"), + "schedule": parsed.get("schedule"), + }, + "positions": positions_to_dicts(positions), + "positions_count": len(position_dicts), + "experience": { + "years": exp_years, + "years_engineering": exp_years_eng, # Saved in JSON too + "confidence": exp_conf, + "debug": exp_dbg + }, + "salary": {"min": sal_min, "max": sal_max, "confidence": sal_conf, "debug": sal_dbg}, + "location_guess": location, + "roles": roles, + "skills": skills, + "primary_languages": primary_languages, + "remote_guess": remote, + "english": english, + "llm_summary": llm_summary, + "llm_tags": llm_tags, + "seniority": seniority, + "backend_focus": backend_focus_flag, + "highlights": highlights, + "keywords": keywords, + "llm": { + "used": bool(llm_enriched), + "debug": llm_dbg, + "data": asdict(llm_enriched) if llm_enriched else None, + "review": llm_review_meta, + }, + } + + resume_id = _insert_resume( + con=con, + candidate_id=candidate_id, + sha=sha, + simh=simh, + clean_text=clean, + raw_text=raw_text, + extraction_json=json.dumps(extraction, ensure_ascii=False), + llm_summary=llm_summary, + llm_tags=llm_tags, + extract_method=extract_method, + extract_quality_score=extract_score, + extract_quality_flags=json.dumps(extract_flags, ensure_ascii=False), + extract_pages_json=json.dumps(pages[:40], ensure_ascii=False), + doc_type=dt.doc_type, + doc_type_confidence=dt.confidence, + parse_method=parse_method, + parse_version=_PARSE_VERSION, + sections_json=json.dumps(sections, ensure_ascii=False), + file_path=file_path, + mtime=mtime, + size=size, + near_dup_of=near_dup_of, + ) + + _insert_source(con, resume_id, a) + _insert_positions(con, resume_id, candidate_id, position_dicts) + + if sha and size is not None and mtime is not None: + _update_files_seen(con, sha, size, mtime, resume_id) + + stats["processed_new"] += 1 + + if i % commit_every == 0: + con.commit() + log.info( + f"[import] progress {i}/{len(artifacts)} " + f"new={stats['processed_new']} dup_sha={stats['dup_sha']} " + f"near={stats['near_dup']} err={stats['errors']}", + {}, + ) + + except Exception as e: + stats["errors"] += 1 + log.error("[import] artifact failed", {"err": repr(e), "artifact": a}) + + con.commit() + stats["near_dup_examples"] = near_dup_examples + log.info("[import] done", stats) + return stats diff --git a/search.py b/search.py new file mode 100644 index 0000000..11ecfb2 --- /dev/null +++ b/search.py @@ -0,0 +1,393 @@ +from __future__ import annotations + +import json +import re +import sqlite3 +from typing import Any, Dict, List, Tuple + +from tg_resume_db.normalize import normalize_skill, find_skills_in_text + + +# ----------------------------- +# Normalization helpers +# ----------------------------- + +def _norm_token(v: str) -> str: + return " ".join(str(v).strip().lower().split()) + + +def _as_list(v: Any) -> List[str]: + """ + Accepts: + - None + - list + - "a,b,c" (csv string) + """ + if v is None: + return [] + if isinstance(v, list): + return [str(x) for x in v if str(x).strip()] + s = str(v).strip() + if not s: + return [] + return [x.strip() for x in s.split(",") if x.strip()] + + +def _uniq_keep_order(xs: List[str]) -> List[str]: + seen = set() + out: List[str] = [] + for x in xs: + t = _norm_token(x) + if not t or t in seen: + continue + seen.add(t) + out.append(t) + return out + + +# ----------------------------- +# Pipe-normalized columns filters +# 
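+
+
+# Example (assumed data): a candidate with skills_norm = "|python|fastapi|postgres|"
+# matches _pipe_any_clause("c.skills_norm", ["fastapi", "django"]) because instr()
+# finds the exact "|fastapi|" token; a bare substring like "fast" cannot match.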
+
+
+# -----------------------------
+# Pipe-normalized column filters
+# skills_norm / roles_norm look like: "|python|fastapi|"
+# -----------------------------
+
+def _pipe_any_clause(field: str, values: List[str]) -> Tuple[str, List[Any]]:
+    vals = [_norm_token(x) for x in (values or []) if str(x).strip()]
+    if not vals:
+        return ("1=1", [])
+
+    parts: List[str] = []
+    args: List[Any] = []
+    for v in vals:
+        parts.append(f"instr({field}, ?) > 0")
+        args.append(f"|{v}|")
+
+    return "(" + " OR ".join(parts) + ")", args
+
+
+def _pipe_all_clause(field: str, values: List[str]) -> Tuple[str, List[Any]]:
+    vals = [_norm_token(x) for x in (values or []) if str(x).strip()]
+    if not vals:
+        return ("1=1", [])
+
+    parts: List[str] = []
+    args: List[Any] = []
+    for v in vals:
+        parts.append(f"instr({field}, ?) > 0")
+        args.append(f"|{v}|")
+
+    return "(" + " AND ".join(parts) + ")", args
+
+
+# -----------------------------
+# FTS5 sanitizer (fixes comma/garbage breaking MATCH)
+# -----------------------------
+
+# allow longer queries (name lists, long prompts) without aggressive truncation
+_FTS_MAX_TERMS = 48
+
+def _fts_safe_query(q: str) -> str:
+    """
+    Turn free-form recruiter text into a safe FTS5 MATCH expression.
+    We intentionally DO NOT allow raw FTS syntax from user input,
+    because it easily breaks on commas/quotes/etc.
+
+    Example:
+      "Backend developer, опыт 5+ лет, Java C++ Python" ->
+      "\"backend\" OR \"developer\" OR \"опыт\" OR \"лет\" OR \"java\" OR \"cpp\" OR \"python\""
+    """
+    if not q:
+        return "resume"
+
+    s = q.strip().lower()
+
+    # normalize common tokens
+    s = s.replace("c++", "cpp")
+    s = s.replace("c#", "csharp")
+    s = s.replace(".net", "dotnet")
+
+    # remove punctuation that breaks MATCH
+    s = re.sub(r"[,\(\)\[\]\{\};:]+", " ", s)
+    s = re.sub(r"\s+", " ", s).strip()
+
+    # tokens (latin/cyrillic + digits + a few chars)
+    terms = re.findall(r"[a-z0-9а-яё][a-z0-9а-яё._#+-]{1,}", s, flags=re.I)
+    terms = terms[:_FTS_MAX_TERMS]
+
+    if not terms:
+        return "resume"
+
+    # quote every term => safe; join with OR => broad query
+    return " OR ".join([f"\"{t}\"" for t in terms])
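+
+
+# Example (illustrative; assumes normalize_skill keeps these tokens as-is):
+#   _parse_query_modifiers("java developer +spring -php")
+#   -> (["spring"], ["php"], "java developer")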
+ """ + if not q: + return [], [], "" + must_raw = re.findall(r"\+([A-Za-z0-9#.+-]{2,})", q) + excl_raw = re.findall(r"\-([A-Za-z0-9#.+-]{2,})", q) + must = [] + exclude = [] + for t in must_raw: + canon = normalize_skill(t) + if canon: + must.append(canon) + for t in excl_raw: + canon = normalize_skill(t) + if canon: + exclude.append(canon) + if " and " in q.lower() or " & " in q: + must += find_skills_in_text(q) + + cleaned = re.sub(r"[+-][A-Za-z0-9#.+-]{2,}", " ", q) + cleaned = re.sub(r"\s+", " ", cleaned).strip() + return _uniq_keep_order(must), _uniq_keep_order(exclude), cleaned + + +# ----------------------------- +# Contacts +# ----------------------------- + +def _fetch_contacts_map(con: sqlite3.Connection, candidate_id: str) -> Dict[str, List[str]]: + rows = con.execute( + "SELECT contact_type, contact_value FROM candidate_contacts WHERE candidate_id=?", + (candidate_id,), + ).fetchall() + + m: Dict[str, List[str]] = {} + for r in rows: + m.setdefault(r["contact_type"], []).append(r["contact_value"]) + + # чуть чище: уберём дубль-контакты + for k, vals in list(m.items()): + m[k] = _uniq_keep_order(vals) + + return m + + +# ----------------------------- +# Main search (FTS + filters) +# ----------------------------- + +def search( + con: sqlite3.Connection, + query: str, + filters: Dict[str, Any], + limit: int = 20, + offset: int = 0, +) -> List[Dict[str, Any]]: + """ + Search candidates using: + - FTS5 for ranking/snippet + - stack filters for skills/roles via pipe-normalized columns + - basic filters: remote/location/experience/salary/english + """ + + where: List[str] = ["r.is_active = 1"] + params: List[Any] = [] + + must_skills, exclude_skills, cleaned_query = _parse_query_modifiers(query or "") + + # -------- basic filters -------- + if filters.get("remote") is not None: + where.append("c.remote = ?") + params.append(1 if bool(filters["remote"]) else 0) + + if filters.get("location"): + where.append("c.location IS NOT NULL AND lower(c.location) LIKE ?") + params.append("%" + str(filters["location"]).lower() + "%") + + # Используем experience_years для SQL-фильтрации (широкий поиск), + # а строгая проверка experience_years_eng будет на этапе пост-фильтрации в agent.py + if filters.get("experience_min") is not None: + where.append("c.experience_years IS NOT NULL AND c.experience_years >= ?") + params.append(float(filters["experience_min"])) + + # Salary: "unknown salary doesn't exclude" + if filters.get("salary_min") is not None: + where.append("(c.salary_max IS NULL OR c.salary_max >= ?)") + params.append(int(filters["salary_min"])) + + if filters.get("salary_max") is not None: + where.append("(c.salary_min IS NULL OR c.salary_min <= ?)") + params.append(int(filters["salary_max"])) + + if filters.get("doc_type"): + where.append("r.doc_type = ?") + params.append(str(filters["doc_type"])) + + # English: не фильтруем на уровне SQL (иначе B2 не поймает C1/C2); постфильтр в agent.py + + # -------- roles/skills stack filters -------- + # backward compatibility + skills_any: List[str] = [] + skills_all: List[str] = [] + roles_any: List[str] = [] + + if filters.get("skill"): + skills_any.append(str(filters["skill"])) + if filters.get("role"): + roles_any.append(str(filters["role"])) + + skills_any += _as_list(filters.get("skills_any")) + skills_all += _as_list(filters.get("skills_all")) + roles_any += _as_list(filters.get("roles_any")) + + skills_any = _uniq_keep_order([normalize_skill(s) or s for s in skills_any]) + skills_all = _uniq_keep_order([normalize_skill(s) or s for s in 
skills_all]) + roles_any = _uniq_keep_order(roles_any) + + if must_skills: + skills_all = _uniq_keep_order(skills_all + must_skills) + + # Denis rule: if any skills were provided -> enforce ANY match + if skills_any: + clause, args = _pipe_any_clause("c.skills_norm", skills_any) + where.append(clause) + params.extend(args) + + if skills_all: + clause, args = _pipe_all_clause("c.skills_norm", skills_all) + where.append(clause) + params.extend(args) + + if roles_any: + clause, args = _pipe_any_clause("c.roles_norm", roles_any) + where.append(clause) + params.extend(args) + + if exclude_skills: + for sk in exclude_skills: + where.append("instr(c.skills_norm, ?) = 0") + params.append(f"|{sk}|") + + # -------- FTS query (SAFE) -------- + fts_q = _fts_safe_query(cleaned_query or "") + + limit = max(1, min(int(limit or 20), 100)) + offset = max(0, int(offset or 0)) + + # UPDATED SQL: Added experience_years_eng and language/backend metadata + sql = f""" + SELECT + c.candidate_id, + c.name, + c.location, + c.remote, + c.experience_years, + c.experience_years_eng, + c.experience_confidence, + c.salary_min, + c.salary_max, + c.salary_confidence, + c.english_level, + c.roles_json, + c.skills_json, + c.primary_languages_json, + c.backend_focus, + r.doc_type, + r.doc_type_confidence, + r.parse_method, + r.resume_id, + snippet(resumes_fts, 2, '[', ']', '…', 14) AS snippet, + bm25(resumes_fts) AS rank + FROM resumes_fts + JOIN resumes r ON r.resume_id = resumes_fts.resume_id + JOIN candidates c ON c.candidate_id = resumes_fts.candidate_id + WHERE resumes_fts MATCH ? AND {" AND ".join(where)} + ORDER BY rank + LIMIT ? OFFSET ? + """ + + rows = con.execute(sql, [fts_q] + params + [limit, offset]).fetchall() + + out: List[Dict[str, Any]] = [] + for row in rows: + cand_id = row["candidate_id"] + contacts_map = _fetch_contacts_map(con, cand_id) + + out.append( + { + "candidate_id": cand_id, + "name": row["name"], + "location": row["location"], + "remote": bool(row["remote"]) if row["remote"] is not None else None, + "experience_years": row["experience_years"], + "experience_years_eng": row["experience_years_eng"], # Passed to agent + "experience_confidence": row["experience_confidence"], + "salary_min": row["salary_min"], + "salary_max": row["salary_max"], + "salary_confidence": row["salary_confidence"], + "english_level": row["english_level"], + "roles": json.loads(row["roles_json"] or "[]"), + "skills": json.loads(row["skills_json"] or "[]"), + "primary_languages": json.loads(row["primary_languages_json"] or "[]"), + "backend_focus": (bool(row["backend_focus"]) if row["backend_focus"] is not None else None), + "doc_type": row["doc_type"], + "doc_type_confidence": row["doc_type_confidence"], + "parse_method": row["parse_method"], + "contacts": contacts_map, + "resume_id": row["resume_id"], + "snippet": row["snippet"], + "rank": row["rank"], + } + ) + + return out + + +# ----------------------------- +# Agent helper (SearchPlan -> search()) +# ----------------------------- + +def _join_csv(xs: List[str]) -> str: + xs = [str(x).strip() for x in (xs or []) if str(x).strip()] + return ",".join(xs) + + +def search_with_filters(con: sqlite3.Connection, plan: Any) -> Dict[str, Any]: + """ + Wrapper for agent.py. 
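+
+
+# Usage sketch (SearchPlan from agent.py fits this shape; the values are assumed):
+#   plan = SearchPlan(query_text="python backend", skills_all=["python"], remote=True, limit=10)
+#   res = search_with_filters(con, plan)  # -> {"items": [...], "count": N}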
+
+
+def search_with_filters(con: sqlite3.Connection, plan: Any) -> Dict[str, Any]:
+    """
+    Wrapper for agent.py.
+    Expects `plan` with fields:
+      query_text, skills_any, skills_all, roles_any, location, remote,
+      english_min, exp_years_min, salary_min, salary_max, limit, sort
+    Returns:
+      { "items": [...], "count": N }
+    """
+    filters = {
+        "remote": getattr(plan, "remote", None),
+        "location": getattr(plan, "location", None),
+        "experience_min": getattr(plan, "exp_years_min", None),
+        "salary_min": getattr(plan, "salary_min", None),
+        "salary_max": getattr(plan, "salary_max", None),
+        "english": getattr(plan, "english_min", None),
+        "roles_any": _join_csv(getattr(plan, "roles_any", []) or []),
+        "skills_any": _join_csv(getattr(plan, "skills_any", []) or []),
+        "skills_all": _join_csv(getattr(plan, "skills_all", []) or []),
+    }
+
+    items = search(
+        con,
+        query=(getattr(plan, "query_text", "") or "").strip(),
+        filters=filters,
+        limit=int(getattr(plan, "limit", 20) or 20),
+        offset=0,
+    )
+
+    sort_mode = (getattr(plan, "sort", "rank") or "rank").strip()
+
+    if sort_mode == "exp_desc":
+        def k(it: Dict[str, Any]):
+            v = it.get("experience_years")
+            return (v is None, -(v or 0.0))
+        items = sorted(items, key=k)
+
+    elif sort_mode == "salary_desc":
+        def k(it: Dict[str, Any]):
+            v = it.get("salary_max") if it.get("salary_max") is not None else it.get("salary_min")
+            return (v is None, -(v or 0))
+        items = sorted(items, key=k)
+
+    return {"items": items, "count": len(items)}
diff --git a/util.py b/util.py
new file mode 100644
index 0000000..b27fe5d
--- /dev/null
+++ b/util.py
@@ -0,0 +1,33 @@
+from __future__ import annotations
+
+import json
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+def utc_iso() -> str:
+    return datetime.now(timezone.utc).replace(microsecond=0, tzinfo=None).isoformat() + "Z"  # utcnow() is deprecated
+
+class Logger:
+    def __init__(self, log_path: Optional[str] = None):
+        self.log_path = Path(log_path) if log_path else None
+        if self.log_path:
+            self.log_path.parent.mkdir(parents=True, exist_ok=True)
+
+    def _write(self, level: str, msg: str, extra: Optional[Dict[str, Any]] = None) -> None:
+        line = f"{utc_iso()} [{level}] {msg}"
+        print(line, file=sys.stdout, flush=True)
+        if self.log_path:
+            payload = {"ts": utc_iso(), "level": level, "msg": msg, "extra": extra or {}}
+            with self.log_path.open("a", encoding="utf-8") as f:
+                f.write(json.dumps(payload, ensure_ascii=False) + "\n")
+
+    def info(self, msg: str, extra: Optional[Dict[str, Any]] = None) -> None:
+        self._write("INFO", msg, extra)
+
+    def warn(self, msg: str, extra: Optional[Dict[str, Any]] = None) -> None:
+        self._write("WARN", msg, extra)
+
+    def error(self, msg: str, extra: Optional[Dict[str, Any]] = None) -> None:
+        self._write("ERROR", msg, extra)