Initial commit

2026-03-11 15:27:10 +03:00
commit 8b4b8d54d1
34 changed files with 7407 additions and 0 deletions

3
.gitignore vendored Normal file

@@ -0,0 +1,3 @@
.venv/
__pycache__/
*.pyc

2
__init__.py Normal file

@@ -0,0 +1,2 @@
__all__ = []
__version__ = "1.0.0"

1184
agent.py Normal file

File diff suppressed because it is too large

77
api.py Normal file

@@ -0,0 +1,77 @@
from __future__ import annotations
import os
from typing import Any, Dict, Optional
from fastapi import FastAPI
from pydantic import BaseModel, Field
from tg_resume_db.db import connect, init_db
from tg_resume_db.agent import agent_search
from tg_resume_db.search import search as db_search
DB_PATH = os.environ.get("CANDIDATES_DB", "./candidates.db")
app = FastAPI(title="Resume Search API", version="1.0")
class SearchRequest(BaseModel):
query: str = Field(default="")
limit: int = Field(default=20, ge=1, le=100)
offset: int = Field(default=0, ge=0)
remote: Optional[bool] = None
location: Optional[str] = None
experience_min: Optional[float] = None
salary_min: Optional[int] = None
salary_max: Optional[int] = None
english: Optional[str] = None
role: Optional[str] = None
skill: Optional[str] = None
class AISearchRequest(BaseModel):
prompt: str = Field(default="")
limit: int = Field(default=20, ge=1, le=100)
ai_iters: int = Field(default=2, ge=0, le=5)
@app.on_event("startup")
def _startup():
con = connect(DB_PATH)
init_db(con)
con.close()
@app.get("/health")
def health():
return {"ok": True}
@app.post("/search")
def search(req: SearchRequest) -> Dict[str, Any]:
con = connect(DB_PATH)
try:
items = db_search(con, query=req.query, filters=req.model_dump(), limit=req.limit, offset=req.offset)
return {"items": items, "count": len(items)}
finally:
con.close()
@app.post("/search/ai")
def search_ai(req: AISearchRequest) -> Dict[str, Any]:
con = connect(DB_PATH)
try:
res = agent_search(
con,
user_prompt=req.prompt,
max_iters=req.ai_iters,
limit=req.limit,
)
return {
"ai": True,
"llm_used": res.get("llm_used", False),
"plan": res.get("plan"),
"history": res.get("history"),
"postfilter": res.get("postfilter"),
"items": res.get("items", []),
"count": int(res.get("count", 0)),
}
finally:
con.close()
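
For context, a minimal client sketch for the /search endpoint above (not part of the commit). It assumes the app is served locally, e.g. via uvicorn, and that httpx is installed; the base URL is a placeholder.

import httpx

def search_candidates(base_url: str = "http://127.0.0.1:8000") -> list:
    # Mirrors SearchRequest above; omitted fields fall back to their defaults.
    payload = {"query": "python", "limit": 5, "remote": True}
    resp = httpx.post(f"{base_url}/search", json=payload, timeout=30.0)
    resp.raise_for_status()
    return resp.json()["items"]

if __name__ == "__main__":
    for item in search_candidates():
        print(item)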

267
bundle_export.py Normal file

@@ -0,0 +1,267 @@
from __future__ import annotations
import json
import os
import re
import shutil
import sqlite3
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional
# NEW: PDF merge helper (pypdf)
# pip install pypdf
try:
from tg_resume_db.pdf_merge import merge_all_pdfs_in_dir
except Exception:  # do not break bundling if pypdf / the pdf_merge module is not installed
merge_all_pdfs_in_dir = None
def _slug(s: str, max_len: int = 60) -> str:
s = (s or "").strip()
if not s:
return "candidate"
s = re.sub(r"\s+", " ", s)
s = re.sub(r"[^0-9A-Za-zА-Яа-я_\- ]+", "_", s)
s = s.replace(" ", "_")
s = re.sub(r"_+", "_", s).strip("_")
if not s:
return "candidate"
return s[:max_len]
def _safe_mkdir(p: Path) -> None:
p.mkdir(parents=True, exist_ok=True)
def _pick_source_paths(con: sqlite3.Connection, resume_id: str) -> List[str]:
"""
Возвращает список самых приоритетных путей к файлу резюме.
1) resumes.file_path
2) sources.original_file_path
3) некоторые варианты путей из sources.extra_json
"""
paths: List[str] = []
row = con.execute(
"SELECT file_path FROM resumes WHERE resume_id=?",
(resume_id,),
).fetchone()
if row and row["file_path"]:
paths.append(str(row["file_path"]))
cur = con.execute(
"""SELECT original_file_path, original_file_name, extra_json
FROM sources
WHERE resume_id=?""",
(resume_id,),
)
for r in cur.fetchall():
ofp = r["original_file_path"]
if ofp:
paths.append(str(ofp))
try:
extra = json.loads(r["extra_json"] or "{}")
if isinstance(extra, dict):
for k in ("file_path", "path", "local_path", "source_path"):
if extra.get(k):
paths.append(str(extra[k]))
except Exception:
pass
# dedupe
seen = set()
out: List[str] = []
for p in paths:
p2 = os.path.normpath(p)
if p2 in seen:
continue
seen.add(p2)
out.append(p2)
return out
def _copy_unique(src: Path, dst_dir: Path, base_name: str) -> Path:
ext = src.suffix.lower() if src.suffix else ""
candidate = f"{base_name}{ext}"
dst = dst_dir / candidate
if dst.exists():
i = 2
while True:
dst = dst_dir / f"{base_name}({i}){ext}"
if not dst.exists():
break
i += 1
shutil.copy2(src, dst)
return dst
def bundle_search_results(
con: sqlite3.Connection,
results: Iterable[Dict[str, Any]],
out_dir: str,
*,
copy_files: bool = True,
merge_text: bool = True,
merge_pdf: bool = True, # NEW
) -> Dict[str, Any]:
"""
results: iterable dictов где есть минимум:
- resume_id
- candidate_id
- name (желательно)
Создаёт:
- files/: скопированные исходные файлы резюме
- merged_resumes.txt: склейка текста clean_text из БД (если merge_text)
- pdf/merged.pdf: склейка всех PDF из files/ (если merge_pdf и pypdf установлен)
- manifest.json
- README.txt
"""
out_root = Path(out_dir).resolve()
files_dir = out_root / "files"
_safe_mkdir(files_dir)
manifest: List[Dict[str, Any]] = []
copied = 0
missing = 0
merged_parts: List[str] = []
merged_txt_path = out_root / "merged_resumes.txt"
for item in results:
resume_id = item.get("resume_id")
cand_id = item.get("candidate_id")
name = item.get("name") or ""
if not resume_id or not cand_id:
continue
# merged TXT from the DB
if merge_text:
row = con.execute(
"SELECT clean_text FROM resumes WHERE resume_id=?",
(resume_id,),
).fetchone()
clean_text = (row["clean_text"] if row else "") or ""
header = f"===== {name or cand_id} | {cand_id} | {resume_id} ====="
merged_parts.append(header)
merged_parts.append(clean_text.strip())
merged_parts.append("")
if not copy_files:
continue
src_paths = _pick_source_paths(con, resume_id)
src_found: Optional[Path] = None
for sp in src_paths:
p = Path(sp)
if p.exists() and p.is_file():
src_found = p
break
if not src_found:
missing += 1
manifest.append(
{
"candidate_id": cand_id,
"name": name,
"resume_id": resume_id,
"copied": False,
"reason": "source_file_not_found",
"tried_paths": src_paths,
}
)
continue
base = f"{_slug(name) or _slug(cand_id)}__{resume_id}"
try:
dst = _copy_unique(src_found, files_dir, base)
copied += 1
manifest.append(
{
"candidate_id": cand_id,
"name": name,
"resume_id": resume_id,
"copied": True,
"source_path": str(src_found),
"dest_path": str(dst),
}
)
except Exception as e:
missing += 1
manifest.append(
{
"candidate_id": cand_id,
"name": name,
"resume_id": resume_id,
"copied": False,
"reason": f"copy_failed: {repr(e)}",
"source_path": str(src_found),
}
)
# merged TXT
if merge_text:
merged_txt_path.write_text("\n".join(merged_parts), encoding="utf-8", errors="ignore")
# NEW: merged PDF from files/*.pdf
merged_pdf_path: Optional[Path] = None
pdf_info: Optional[Dict[str, Any]] = None
if merge_pdf and copy_files and merge_all_pdfs_in_dir is not None:
try:
merged_pdf_path = out_root / "pdf" / "merged.pdf"
_safe_mkdir(merged_pdf_path.parent)
pdf_info = merge_all_pdfs_in_dir(files_dir, merged_pdf_path)
except Exception as e:
pdf_info = {"error": f"merge_pdf_failed: {repr(e)}"}
# manifest.json
(out_root / "manifest.json").write_text(
json.dumps(
{
"out_dir": str(out_root),
"copied_files": copied,
"missing_files": missing,
"merged_text": str(merged_txt_path) if merge_text else None,
"merged_pdf": str(merged_pdf_path) if merged_pdf_path else None,
"pdf_info": pdf_info,
"items": manifest,
},
ensure_ascii=False,
indent=2,
),
encoding="utf-8",
errors="ignore",
)
# README
readme_lines = [
"Папка создана командой search.",
"- files/: скопированные исходные файлы резюме",
"- merged_resumes.txt: склейка текста clean_text из БД",
"- manifest.json: что откуда скопировалось / что не найдено",
]
if merge_pdf:
if merge_all_pdfs_in_dir is None:
readme_lines.append("- pdf/merged.pdf: НЕ создан (нужен пакет pypdf и модуль pdf_merge.py)")
else:
readme_lines.append("- pdf/merged.pdf: склейка всех PDF из files/ (если PDF были)")
(out_root / "README.txt").write_text(
"\n".join(readme_lines) + "\n",
encoding="utf-8",
errors="ignore",
)
return {
"out_dir": str(out_root),
"copied_files": copied,
"missing_files": missing,
"merged_text": str(merged_txt_path) if merge_text else None,
"merged_pdf": str(merged_pdf_path) if merged_pdf_path else None,
"manifest": str(out_root / "manifest.json"),
"pdf_info": pdf_info,
}
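
A hedged usage sketch for bundle_search_results (not part of the commit). It assumes ./candidates.db already exists and is populated, and that an empty filters dict is acceptable to tg_resume_db.search.search; adjust paths and the query to taste.

from tg_resume_db.db import connect, init_db
from tg_resume_db.search import search as run_search
from tg_resume_db.bundle_export import bundle_search_results

con = connect("./candidates.db")
init_db(con)
try:
    # Find candidates, then copy their source files and merge their texts into ./bundle_demo.
    items = run_search(con, query="golang", filters={}, limit=10, offset=0)
    info = bundle_search_results(con, items, "./bundle_demo", copy_files=True, merge_text=True)
    print(info["copied_files"], info["missing_files"], info["manifest"])
finally:
    con.close()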

282
cli.py Normal file

@@ -0,0 +1,282 @@
from __future__ import annotations
import argparse
import json
import sys
from datetime import datetime
from typing import Any, Dict
from pathlib import Path
import os
from tg_resume_db.bundle_export import bundle_search_results
from tg_resume_db.db import connect, init_db
from tg_resume_db.pipeline import import_exports as run_import
from tg_resume_db.search import search as run_search
from tg_resume_db.util import Logger
from tg_resume_db.extract.text_extract import extract_text as extract_text_generic
from tg_resume_db.extract.pdf_extract import extract_pdf_best
from tg_resume_db.extract.clean import normalize_text
from tg_resume_db.extract.doc_type import detect_doc_type
from tg_resume_db.extract.sections import split_sections, sections_present
from tg_resume_db.extract.parse import extract_name_guess
def _print_json(obj: Dict[str, Any]) -> None:
s = json.dumps(obj, ensure_ascii=False, indent=2)
try:
print(s)
except UnicodeEncodeError:
# Fallback for cp1251/legacy consoles.
print(s.encode("ascii", "backslashreplace").decode("ascii"))
def _is_interactive() -> bool:
return sys.stdin.isatty() and sys.stdout.isatty()
def main() -> None:
ap = argparse.ArgumentParser(prog="tg_resume_db")
sub = ap.add_subparsers(dest="cmd", required=True)
# ---------------- import_exports ----------------
imp = sub.add_parser("import_exports", help="Import Telegram exports recursively (incremental)")
imp.add_argument("--input", required=True, help="Path to exports directory")
imp.add_argument("--db", required=True, help="SQLite db path")
imp.add_argument("--log", default="./import.log", help="Log file path")
imp.add_argument("--near-dist", type=int, default=6, help="Simhash max Hamming distance for near-duplicates")
imp.add_argument("--min-text-len", type=int, default=250, help="Skip very short texts")
imp.add_argument(
"--llm",
choices=["auto", "off", "force"],
default="auto",
help="LLM enrichment mode: auto (default), off to disable, force to always run when configured",
)
imp.add_argument(
"--llm-review",
choices=["always", "auto", "off"],
default="always",
help="LLM review mode for parsed JSON: always (default), auto, off",
)
imp.add_argument(
"--llm-review-rounds",
type=int,
default=1,
help="How many LLM review merge rounds to run per resume (1..3)",
)
# ---------------- search ----------------
s = sub.add_parser("search", help="Search candidates")
s.add_argument("--db", required=True)
s.add_argument("--query", required=True)
s.add_argument("--limit", type=int, default=20)
s.add_argument("--offset", type=int, default=0)
s.add_argument("--remote", choices=["true", "false"], default=None)
s.add_argument("--location", default=None)
s.add_argument("--experience-min", type=float, default=None)
s.add_argument("--salary-min", type=int, default=None)
s.add_argument("--salary-max", type=int, default=None)
s.add_argument("--english", default=None)
s.add_argument("--doc-type", default=None)
# AI mode
s.add_argument("--ai", action="store_true", help="Use LLM to build filters from text query and run search")
s.add_argument("--ai-iters", type=int, default=2, help="How many refine iterations for AI search")
# Backward compatible single-value filters
s.add_argument("--role", default=None, help="Single role (backward compatible); prefer --roles-any")
s.add_argument("--skill", default=None, help="Single skill (backward compatible); prefer --skills-any/--skills-all")
# Stack filters (comma-separated)
s.add_argument("--roles-any", default=None, help="Comma-separated roles; at least one must match")
s.add_argument("--skills-any", default=None, help="Comma-separated skills; at least one must match")
s.add_argument("--skills-all", default=None, help="Comma-separated skills; all must match")
# Bundle export behavior
s.add_argument("--bundle", choices=["ask", "yes", "no"], default="ask", help="Bundle found resumes into a folder")
# ---------------- inspect ----------------
ins = sub.add_parser("inspect", help="Inspect a single resume file (doc_type/sections)")
ins.add_argument("--file", required=True, help="Path to resume file")
args = ap.parse_args()
# ========================= import_exports =========================
if args.cmd == "import_exports":
con = connect(args.db)
try:
init_db(con)
log = Logger(args.log)
prev_enabled = os.environ.get("LLM_PARSE_ENABLED")
prev_force = os.environ.get("LLM_PARSE_FORCE")
prev_review_mode = os.environ.get("LLM_PARSE_REVIEW_MODE")
prev_review_rounds = os.environ.get("LLM_PARSE_REVIEW_ROUNDS")
try:
if args.llm == "off":
os.environ["LLM_PARSE_ENABLED"] = "0"
os.environ["LLM_PARSE_REVIEW_MODE"] = "off"
elif args.llm == "force":
os.environ["LLM_PARSE_ENABLED"] = "1"
os.environ["LLM_PARSE_FORCE"] = "1"
os.environ["LLM_PARSE_REVIEW_MODE"] = "always"
else:
os.environ["LLM_PARSE_REVIEW_MODE"] = args.llm_review
rounds = max(1, min(int(args.llm_review_rounds), 3))
os.environ["LLM_PARSE_REVIEW_ROUNDS"] = str(rounds)
stats = run_import(
con=con,
input_dir=args.input,
log=log,
max_near_dist=args.near_dist,
min_text_len=args.min_text_len,
)
finally:
if args.llm == "off":
if prev_enabled is None:
os.environ.pop("LLM_PARSE_ENABLED", None)
else:
os.environ["LLM_PARSE_ENABLED"] = prev_enabled
elif args.llm == "force":
if prev_enabled is None:
os.environ.pop("LLM_PARSE_ENABLED", None)
else:
os.environ["LLM_PARSE_ENABLED"] = prev_enabled
if prev_force is None:
os.environ.pop("LLM_PARSE_FORCE", None)
else:
os.environ["LLM_PARSE_FORCE"] = prev_force
if prev_review_mode is None:
os.environ.pop("LLM_PARSE_REVIEW_MODE", None)
else:
os.environ["LLM_PARSE_REVIEW_MODE"] = prev_review_mode
if prev_review_rounds is None:
os.environ.pop("LLM_PARSE_REVIEW_ROUNDS", None)
else:
os.environ["LLM_PARSE_REVIEW_ROUNDS"] = prev_review_rounds
finally:
con.close()
_print_json(stats)
return
# ============================= search =============================
if args.cmd == "search":
con = connect(args.db)
init_db(con)  # important: guarantees that resumes_fts and its triggers exist
try:
items: list[Dict[str, Any]] = []
out: Dict[str, Any] = {}
if args.ai:
from tg_resume_db.agent import agent_search
res = agent_search(
con,
user_prompt=args.query,
max_iters=args.ai_iters,
)
items = res.get("items", [])
out = {
"ai": True,
"llm_used": res.get("llm_used", False),
"plan": res.get("plan"),
"history": res.get("history"),
"postfilter": res.get("postfilter"),
"items": items,
"count": res.get("count", len(items)),
}
else:
filters = {
"remote": (args.remote == "true") if args.remote is not None else None,
"location": args.location,
"experience_min": args.experience_min,
"salary_min": args.salary_min,
"salary_max": args.salary_max,
"english": args.english,
"doc_type": args.doc_type,
# backward compat
"role": args.role,
"skill": args.skill,
# new
"roles_any": args.roles_any,
"skills_any": args.skills_any,
"skills_all": args.skills_all,
}
items = run_search(
con,
query=args.query,
filters=filters,
limit=args.limit,
offset=args.offset,
)
out = {"ai": False, "items": items, "count": len(items)}
# 1) print the results
_print_json(out)
# 2) bundle prompt/flag
if args.bundle == "yes":
do_bundle = True
elif args.bundle == "no":
do_bundle = False
else: # ask
do_bundle = False
if _is_interactive():
ans = input("\nСобрать найденные резюме в папку? (Y/N): ").strip().lower()
do_bundle = ans in ("y", "yes", "да", "д")
if do_bundle:
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
out_dir = f"./bundle_{ts}"
info = bundle_search_results(con, items, out_dir, copy_files=True, merge_text=True)
print(f"\n[done] Готово: {info['out_dir']}")
print(f" files copied: {info['copied_files']}, missing: {info['missing_files']}")
print(f" merged: {info['merged_text']}")
print(f" manifest: {info['manifest']}")
return
finally:
con.close()
# ============================= inspect =============================
if args.cmd == "inspect":
fp = args.file
path = Path(fp)
extract_meta = {}
if path.suffix.lower() == ".pdf":
pdf_res = extract_pdf_best(path, timeout_sec=25)
raw_text = pdf_res.text
extract_meta = {
"method": pdf_res.method,
"quality_score": pdf_res.score,
"quality_flags": pdf_res.flags,
"pages": len(pdf_res.pages),
}
else:
raw_text = extract_text_generic(path)
extract_meta = {"method": "generic"}
clean = normalize_text(raw_text or "")
dt = detect_doc_type(clean, file_ext=Path(fp).suffix.lower())
secs = split_sections(clean, dt.doc_type)
out = {
"file": fp,
"doc_type": dt.doc_type,
"confidence": dt.confidence,
"signals": dt.signals,
"extract": extract_meta,
"sections_present": sections_present(secs),
"name_guess": extract_name_guess(clean),
}
_print_json(out)
return
if __name__ == "__main__":
main()
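
To exercise the CLI without a shell, one option is to set sys.argv and call main() directly; a sketch (not part of the commit), assuming ./candidates.db has already been built by import_exports (an empty DB simply yields zero results).

import sys
from tg_resume_db.cli import main

# Equivalent to: tg_resume_db search --db ./candidates.db --query "python senior" --limit 5 --bundle no
sys.argv = [
    "tg_resume_db", "search",
    "--db", "./candidates.db",
    "--query", "python senior",
    "--limit", "5",
    "--bundle", "no",
]
main()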

296
db.py Normal file

@@ -0,0 +1,296 @@
from __future__ import annotations
import sqlite3
from pathlib import Path
SCHEMA = r"""
PRAGMA journal_mode=WAL;
PRAGMA synchronous=NORMAL;
PRAGMA temp_store=MEMORY;
CREATE TABLE IF NOT EXISTS candidates (
candidate_id TEXT PRIMARY KEY,
name TEXT,
location TEXT,
remote INTEGER,
experience_years REAL,
experience_years_eng REAL, -- engineering experience (after the HR filter)
experience_confidence REAL,
salary_min INTEGER,
salary_max INTEGER,
salary_confidence REAL,
english_level TEXT,
roles_json TEXT,
skills_json TEXT,
primary_languages_json TEXT,
backend_focus INTEGER,
roles_norm TEXT, -- "|backend|devops|"
skills_norm TEXT, -- "|python|k8s|"
created_at TEXT DEFAULT (datetime('now')),
updated_at TEXT DEFAULT (datetime('now'))
);
CREATE TABLE IF NOT EXISTS candidate_contacts (
contact_type TEXT NOT NULL, -- email/phone/tg/github/linkedin/url
contact_value TEXT NOT NULL, -- normalized
candidate_id TEXT NOT NULL,
created_at TEXT DEFAULT (datetime('now')),
PRIMARY KEY(contact_type, contact_value),
FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);
CREATE INDEX IF NOT EXISTS idx_contacts_candidate ON candidate_contacts(candidate_id);
CREATE TABLE IF NOT EXISTS resumes (
resume_id TEXT PRIMARY KEY,
candidate_id TEXT NOT NULL,
sha256 TEXT,
simhash TEXT,
clean_text TEXT NOT NULL,
raw_text TEXT,
extraction_json TEXT,
llm_summary TEXT,
llm_tags_json TEXT,
extract_method TEXT,
extract_quality_score REAL,
extract_quality_flags TEXT,
extract_pages_json TEXT,
doc_type TEXT,
doc_type_confidence REAL,
parse_method TEXT,
parse_version TEXT,
sections_json TEXT,
is_active INTEGER DEFAULT 1,
duplicate_of_resume_id TEXT,
file_path TEXT,
file_mtime INTEGER,
file_size INTEGER,
created_at TEXT DEFAULT (datetime('now')),
FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);
CREATE UNIQUE INDEX IF NOT EXISTS idx_resumes_sha ON resumes(sha256) WHERE sha256 IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_resumes_candidate ON resumes(candidate_id);
CREATE INDEX IF NOT EXISTS idx_resumes_active ON resumes(is_active);
CREATE TABLE IF NOT EXISTS sources (
source_id INTEGER PRIMARY KEY AUTOINCREMENT,
resume_id TEXT NOT NULL,
export_path TEXT,
chat_title TEXT,
message_id TEXT,
message_date TEXT,
origin_type TEXT,
original_file_path TEXT,
original_file_name TEXT,
extra_json TEXT,
created_at TEXT DEFAULT (datetime('now')),
FOREIGN KEY(resume_id) REFERENCES resumes(resume_id)
);
CREATE TABLE IF NOT EXISTS files_seen (
sha256 TEXT PRIMARY KEY,
size INTEGER,
mtime INTEGER,
canonical_resume_id TEXT,
first_seen_at TEXT DEFAULT (datetime('now')),
last_seen_at TEXT DEFAULT (datetime('now'))
);
CREATE TABLE IF NOT EXISTS simhash_buckets (
bucket INTEGER NOT NULL,
band INTEGER NOT NULL,
resume_id TEXT NOT NULL,
PRIMARY KEY(bucket, band, resume_id),
FOREIGN KEY(resume_id) REFERENCES resumes(resume_id)
);
CREATE TABLE IF NOT EXISTS candidate_skills (
candidate_id TEXT NOT NULL,
skill_id TEXT NOT NULL,
skill_label TEXT,
confidence REAL,
source TEXT,
evidence TEXT,
created_at TEXT DEFAULT (datetime('now')),
PRIMARY KEY(candidate_id, skill_id),
FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);
CREATE TABLE IF NOT EXISTS candidate_roles (
candidate_id TEXT NOT NULL,
role TEXT NOT NULL,
confidence REAL,
source TEXT,
evidence TEXT,
created_at TEXT DEFAULT (datetime('now')),
PRIMARY KEY(candidate_id, role),
FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);
CREATE TABLE IF NOT EXISTS candidate_languages (
candidate_id TEXT NOT NULL,
language TEXT NOT NULL,
level TEXT,
confidence REAL,
source TEXT,
evidence TEXT,
created_at TEXT DEFAULT (datetime('now')),
PRIMARY KEY(candidate_id, language),
FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);
CREATE TABLE IF NOT EXISTS positions (
position_id TEXT PRIMARY KEY,
resume_id TEXT NOT NULL,
candidate_id TEXT NOT NULL,
title TEXT,
company TEXT,
date_from TEXT,
date_to TEXT,
is_current INTEGER,
description TEXT,
stack_json TEXT,
created_at TEXT DEFAULT (datetime('now')),
FOREIGN KEY(resume_id) REFERENCES resumes(resume_id),
FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);
CREATE TABLE IF NOT EXISTS llm_cache (
cache_key TEXT PRIMARY KEY,
model TEXT,
result_json TEXT,
created_at TEXT DEFAULT (datetime('now'))
);
-- Full-text index (FTS5), kept in sync with resumes via the triggers below
CREATE VIRTUAL TABLE IF NOT EXISTS resumes_fts USING fts5(
resume_id UNINDEXED,
candidate_id UNINDEXED,
clean_text,
tokenize='unicode61 remove_diacritics 2'
);
-- --- Triggers to keep FTS synced with resumes ---
-- Insert
CREATE TRIGGER IF NOT EXISTS trg_resumes_ai_fts
AFTER INSERT ON resumes
BEGIN
DELETE FROM resumes_fts WHERE resume_id = NEW.resume_id;
INSERT INTO resumes_fts(resume_id, candidate_id, clean_text)
SELECT NEW.resume_id, NEW.candidate_id, NEW.clean_text
WHERE NEW.is_active = 1;
END;
-- Delete
CREATE TRIGGER IF NOT EXISTS trg_resumes_ad_fts
AFTER DELETE ON resumes
BEGIN
DELETE FROM resumes_fts WHERE resume_id = OLD.resume_id;
END;
-- Update (text/active/candidate)
CREATE TRIGGER IF NOT EXISTS trg_resumes_au_fts
AFTER UPDATE OF clean_text, is_active, candidate_id ON resumes
BEGIN
DELETE FROM resumes_fts WHERE resume_id = NEW.resume_id;
INSERT INTO resumes_fts(resume_id, candidate_id, clean_text)
SELECT NEW.resume_id, NEW.candidate_id, NEW.clean_text
WHERE NEW.is_active = 1;
END;
"""
def connect(db_path: str) -> sqlite3.Connection:
Path(db_path).parent.mkdir(parents=True, exist_ok=True)
con = sqlite3.connect(db_path)
con.row_factory = sqlite3.Row
return con
def _table_exists(con: sqlite3.Connection, name: str) -> bool:
row = con.execute(
"SELECT 1 FROM sqlite_master WHERE type IN ('table','view') AND name=? LIMIT 1",
(name,),
).fetchone()
return row is not None
def _column_exists(con: sqlite3.Connection, table: str, column: str) -> bool:
cur = con.execute(f"PRAGMA table_info({table})")
for r in cur.fetchall():
if r["name"] == column:
return True
return False
def _add_column_if_missing(con: sqlite3.Connection, table: str, column: str, ddl_type: str) -> None:
if not _table_exists(con, table):
return
if _column_exists(con, table, column):
return
con.execute(f"ALTER TABLE {table} ADD COLUMN {column} {ddl_type}")
def _ensure_fts_backfilled(con: sqlite3.Connection) -> None:
"""
Если resumes_fts пустая / рассинхронизирована - пересобираем из resumes.
Это лечит ситуацию: init_db создал FTS, но данные туда никто не залил => search всегда 0.
"""
if not _table_exists(con, "resumes") or not _table_exists(con, "resumes_fts"):
return
try:
resumes_cnt = int(con.execute("SELECT COUNT(*) AS c FROM resumes WHERE is_active=1").fetchone()["c"])
fts_cnt = int(con.execute("SELECT COUNT(*) AS c FROM resumes_fts").fetchone()["c"])
except Exception:
return
if resumes_cnt <= 0:
return
# Any mismatch -> rebuild (fixes both an empty index and duplicate rows)
if fts_cnt != resumes_cnt:
con.execute("DELETE FROM resumes_fts")
con.execute(
"""
INSERT INTO resumes_fts(resume_id, candidate_id, clean_text)
SELECT resume_id, candidate_id, clean_text
FROM resumes
WHERE is_active=1
"""
)
con.commit()
def init_db(con: sqlite3.Connection) -> None:
con.executescript(SCHEMA)
# Lightweight migrations for existing DBs (safe to re-run)
_add_column_if_missing(con, "candidates", "experience_years_eng", "REAL")
_add_column_if_missing(con, "candidates", "primary_languages_json", "TEXT")
_add_column_if_missing(con, "candidates", "backend_focus", "INTEGER")
_add_column_if_missing(con, "resumes", "llm_summary", "TEXT")
_add_column_if_missing(con, "resumes", "llm_tags_json", "TEXT")
_add_column_if_missing(con, "resumes", "extract_method", "TEXT")
_add_column_if_missing(con, "resumes", "extract_quality_score", "REAL")
_add_column_if_missing(con, "resumes", "extract_quality_flags", "TEXT")
_add_column_if_missing(con, "resumes", "extract_pages_json", "TEXT")
_add_column_if_missing(con, "resumes", "doc_type", "TEXT")
_add_column_if_missing(con, "resumes", "doc_type_confidence", "REAL")
_add_column_if_missing(con, "resumes", "parse_method", "TEXT")
_add_column_if_missing(con, "resumes", "parse_version", "TEXT")
_add_column_if_missing(con, "resumes", "sections_json", "TEXT")
if not _table_exists(con, "llm_cache"):
con.execute(
"""
CREATE TABLE IF NOT EXISTS llm_cache (
cache_key TEXT PRIMARY KEY,
model TEXT,
result_json TEXT,
created_at TEXT DEFAULT (datetime('now'))
)
"""
)
con.commit()
_ensure_fts_backfilled(con)
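
A quick local check of the schema and FTS triggers above (a sketch, not part of the commit); the demo path and sample values are placeholders.

from tg_resume_db.db import connect, init_db

con = connect("./demo.db")
init_db(con)
con.execute("INSERT INTO candidates(candidate_id, name) VALUES (?, ?)", ("cand-1", "Test Candidate"))
con.execute(
    "INSERT INTO resumes(resume_id, candidate_id, clean_text) VALUES (?, ?, ?)",
    ("res-1", "cand-1", "Senior Python backend engineer, FastAPI, PostgreSQL"),
)
con.commit()
# The AFTER INSERT trigger should have indexed the row (is_active defaults to 1).
rows = con.execute("SELECT resume_id FROM resumes_fts WHERE resumes_fts MATCH ?", ("python",)).fetchall()
print([r["resume_id"] for r in rows])  # expected: ['res-1']
con.close()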

41
dedup/simhash.py Normal file

@@ -0,0 +1,41 @@
from __future__ import annotations
import hashlib
import re
from typing import List, Tuple
def sha256_file_bytes_iter(f, chunk_size: int = 1024 * 1024) -> str:
h = hashlib.sha256()
for chunk in iter(lambda: f.read(chunk_size), b""):
h.update(chunk)
return h.hexdigest()
def sha256_file(path) -> str:
with open(path, "rb") as f:
return sha256_file_bytes_iter(f)
def sha1_str(s: str) -> str:
return hashlib.sha1(s.encode("utf-8", errors="ignore")).hexdigest()
def simhash64(text: str) -> int:
tokens = re.findall(r"[a-zA-Z0-9_#+./-]{2,}", text.lower())
if not tokens:
return 0
v = [0] * 64
for tok in tokens:
h = hashlib.md5(tok.encode("utf-8")).digest()
x = int.from_bytes(h[:8], "big", signed=False)
for i in range(64):
v[i] += 1 if ((x >> i) & 1) else -1
out = 0
for i in range(64):
if v[i] > 0:
out |= (1 << i)
return out
def hamming64(a: int, b: int) -> int:
return (a ^ b).bit_count()
def simhash_bands(x: int) -> List[Tuple[int, int]]:
# 4 bands x 16 bits
return [(((x >> (band * 16)) & 0xFFFF), band) for band in range(4)]

39
extract/clean.py Normal file

@@ -0,0 +1,39 @@
from __future__ import annotations
import re
from collections import Counter
import unicodedata
RE_PAGE = re.compile(r"^\s*(page|стр(аница)?)\s*\d+\s*(/|\s+of\s+)\s*\d+\s*$", re.I)
RE_ONLY_PUNCT = re.compile(r"^[\W_]+$", re.U)
RE_MULTI_SPACE = re.compile(r"[ \t]+")
RE_MULTI_NL = re.compile(r"\n{3,}")
_INVISIBLE_CHARS = ["\u00ad", "\u200b", "\u200c", "\u200d", "\ufeff"]
_BIDI_CTRL_RE = re.compile(r"[\u202a-\u202e\u2060\u2066-\u2069\ufffe\uffff]")
def normalize_text(raw: str) -> str:
text = raw.replace("\r\n", "\n").replace("\r", "\n")
for ch in _INVISIBLE_CHARS:
text = text.replace(ch, "")
text = _BIDI_CTRL_RE.sub("", text)
# remove most control/format chars but keep line breaks and tabs
text = "".join(
ch for ch in text
if ch in ("\n", "\t") or not unicodedata.category(ch).startswith("C")
)
text = "".join(ch for ch in text if ch == "\n" or (ch.isprintable() and ch != "\x0b"))
lines = [RE_MULTI_SPACE.sub(" ", ln).strip() for ln in text.split("\n")]
lines = [ln for ln in lines if ln and not RE_PAGE.match(ln) and not RE_ONLY_PUNCT.match(ln)]
counts = Counter(lines)
filtered = []
for ln in lines:
if counts[ln] >= 4 and len(ln) <= 90:
continue
filtered.append(ln)
text = "\n".join(filtered)
text = RE_MULTI_NL.sub("\n\n", text).strip()
return text
def to_fts_text(clean: str) -> str:
return re.sub(r"\s+", " ", clean).strip()
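
A sketch of what normalize_text does to noisy extractor output (not part of the commit); the sample string is made up.

from tg_resume_db.extract.clean import normalize_text, to_fts_text

raw = "Resume\u00ad of John\r\nPage 1 of 3\r\n\r\n\r\n\r\nSkills:   Python,  Go"
clean = normalize_text(raw)
print(clean)               # soft hyphen removed, "Page 1 of 3" footer dropped, spacing collapsed
print(to_fts_text(clean))  # single-line form for FTS indexing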

134
extract/doc_type.py Normal file

@@ -0,0 +1,134 @@
from __future__ import annotations
import re
from dataclasses import dataclass
from typing import List, Optional
@dataclass
class DocTypeResult:
doc_type: str
confidence: float
signals: List[str]
_HH_PATTERNS = [
(re.compile(r"\bhh\.ru\b", re.I), 2.0, "hh_domain"),
(re.compile(r"\bheadhunter\b", re.I), 2.0, "headhunter"),
(re.compile(r"\bрезюме\s+обновлено\b", re.I), 2.5, "hh_resume_updated"),
(re.compile(r"\елаемая\s+должность\b", re.I), 1.2, "hh_desired_role"),
(re.compile(r"\bключевые\s+навыки\b", re.I), 1.0, "hh_key_skills"),
(re.compile(r"\bопыт\s+работы\b", re.I), 0.8, "hh_experience"),
]
_LI_PATTERNS = [
(re.compile(r"\blinkedin\b", re.I), 2.5, "li_brand"),
(re.compile(r"\blinkedin\.com\b", re.I), 2.0, "li_domain"),
(re.compile(r"\bexperience\b", re.I), 0.9, "li_experience"),
(re.compile(r"\beducation\b", re.I), 0.9, "li_education"),
(re.compile(r"\bskills\b", re.I), 0.9, "li_skills"),
(re.compile(r"\babout\b", re.I), 0.6, "li_about"),
]
_PPTX_PATTERNS = [
(re.compile(r"\bslide\b", re.I), 1.0, "pptx_slide"),
(re.compile(r"\bpowerpoint\b", re.I), 1.3, "pptx_powerpoint"),
(re.compile(r"\bpptx\b", re.I), 1.3, "pptx_ext"),
(re.compile(r"\bpresentation\b", re.I), 0.8, "pptx_presentation"),
]
def _score_patterns(text: str, patterns) -> tuple[float, List[str]]:
score = 0.0
signals: List[str] = []
for rx, weight, name in patterns:
if rx.search(text):
score += weight
signals.append(name)
return score, signals
def _confidence_from_score(score: float) -> float:
if score >= 4.0:
return 0.92
if score >= 3.0:
return 0.85
if score >= 2.0:
return 0.75
if score >= 1.2:
return 0.62
if score > 0.0:
return 0.50
return 0.30
def detect_doc_type(clean_text: str, file_ext: Optional[str] = None) -> DocTypeResult:
lines = [ln.strip() for ln in (clean_text or "").splitlines() if ln.strip()]
head_lines = lines[:80]
head_text = "\n".join(head_lines)
head_lc = head_text.lower()
signals: List[str] = []
hh_score, hh_signals = _score_patterns(head_text, _HH_PATTERNS)
li_score, li_signals = _score_patterns(head_text, _LI_PATTERNS)
pptx_score, pptx_signals = _score_patterns(head_text, _PPTX_PATTERNS)
if file_ext and file_ext.lower() in (".pptx", ".ppt"):
pptx_score += 2.0
signals.append("pptx_ext")
signals.extend(hh_signals + li_signals + pptx_signals)
# One-page heuristic: short resumes with dense content
total_len = len(clean_text or "")
one_page_score = 0.0
if len(lines) <= 70 and total_len <= 4500:
one_page_score = 2.2
signals.append("one_page_short")
elif len(lines) <= 90 and total_len <= 6500:
one_page_score = 1.6
signals.append("one_page_medium")
# Scan heuristic: very low textual content
letters = sum(ch.isalpha() for ch in clean_text or "")
total = max(1, len(clean_text or ""))
letter_ratio = letters / total
scan_score = 0.0
if total_len < 200 or letter_ratio < 0.12:
scan_score = 3.2
signals.append("scan_low_text")
if file_ext and file_ext.lower() in (".pdf", ".png", ".jpg", ".jpeg", ".tiff"):
scan_score += 0.6
signals.append("scan_file_ext")
candidates = [
("hh_ru", hh_score),
("linkedin_pdf", li_score),
("pptx_export", pptx_score),
("one_page", one_page_score),
("scan_pdf", scan_score),
]
doc_type, best_score = max(candidates, key=lambda x: x[1])
if best_score <= 0.0:
base = "generic_pdf" if (file_ext and file_ext.lower() == ".pdf") else "generic"
return DocTypeResult(doc_type=base, confidence=0.35, signals=signals)
confidence = _confidence_from_score(best_score)
# If scan is detected strongly, prefer it
if doc_type == "scan_pdf" and confidence >= 0.8:
return DocTypeResult(doc_type="scan_pdf", confidence=confidence, signals=signals)
# Split one-page into ru/en
if doc_type == "one_page":
if _looks_cyrillic(head_text):
return DocTypeResult(doc_type="one_page_ru", confidence=confidence, signals=signals)
return DocTypeResult(doc_type="one_page_en", confidence=confidence, signals=signals)
return DocTypeResult(doc_type=doc_type, confidence=confidence, signals=signals)
def _looks_cyrillic(text: str) -> bool:
cyr = len(re.findall(r"[А-Яа-яЁё]", text))
lat = len(re.findall(r"[A-Za-z]", text))
return cyr > lat and cyr >= 10
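
A sketch of detect_doc_type on a tiny made-up snippet (not part of the commit). With this little text the scan_low_text heuristic tends to dominate, so very short inputs usually come back as scan_pdf.

from tg_resume_db.extract.doc_type import detect_doc_type

text = "John Doe\nExperience\nBackend Engineer at Acme\nEducation\nSkills\nPython, Go"
res = detect_doc_type(text, file_ext=".pdf")
print(res.doc_type, round(res.confidence, 2), res.signals)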

159
extract/experience.py Normal file

@@ -0,0 +1,159 @@
from __future__ import annotations
import re
from dataclasses import dataclass
from datetime import date
from typing import Dict, List, Optional, Tuple
# Month maps (EN + RU)
MONTHS = {
"jan": 1, "january": 1, "янв": 1, "январ": 1,
"feb": 2, "february": 2, "фев": 2, "феврал": 2,
"mar": 3, "march": 3, "мар": 3, "март": 3,
"apr": 4, "april": 4, "апр": 4, "апрел": 4,
"may": 5, "май": 5,
"jun": 6, "june": 6, "июн": 6, "июнь": 6,
"jul": 7, "july": 7, "июл": 7, "июль": 7,
"aug": 8, "august": 8, "авг": 8, "август": 8,
"sep": 9, "september": 9, "сен": 9, "сент": 9,
"oct": 10, "october": 10, "окт": 10, "октя": 10,
"nov": 11, "november": 11, "ноя": 11, "ноябр": 11,
"dec": 12, "december": 12, "дек": 12, "дека": 12,
}
PRESENT_RE = re.compile(r"\b(present|now|current|настоящее время|по н\\.|по настоящее)\b", re.I)
# Direct "X years" patterns
DIRECT_YEARS_RE = re.compile(r"(\d+(?:[.,]\d+)?)\s*(?:\+?\s*)?(?:years?|yrs?|лет|года|год)\b", re.I)
# Dates like 03.2019, 2019, Jan 2020, янв 2020
MMYYYY_RE = re.compile(r"\b(0?[1-9]|1[0-2])[./-](\d{4})\b")
YYYY_RE = re.compile(r"\b(19\d{2}|20\d{2})\b")
MON_YYYY_RE = re.compile(r"\b([A-Za-z]{3,9}|[А-Яа-я]{3,9})\.?\s*(\d{4})\b")
# Range separators
RANGE_RE = re.compile(r"(?P<a>.+?)\s*(?:—||-|to|по)\s*(?P<b>.+?)$", re.I)
@dataclass
class ExpResult:
years: Optional[float]
confidence: float
debug: Dict
def _clamp_years(y: float) -> Optional[float]:
if 0.0 <= y <= 45.0:
return y
return None
def _parse_mon(mon: str) -> Optional[int]:
m = mon.strip().lower()
m = re.sub(r"[^\wа-я]+", "", m, flags=re.I)
# allow prefixes: "январ", "феврал"
for k, v in MONTHS.items():
if m.startswith(k):
return v
return None
def _as_ymd(y: int, m: int) -> date:
return date(y, m, 1)
def _parse_one_date(s: str) -> Optional[date]:
s = s.strip()
if PRESENT_RE.search(s):
today = date.today()
return date(today.year, today.month, 1)
m1 = MMYYYY_RE.search(s)
if m1:
mm = int(m1.group(1))
yy = int(m1.group(2))
return _as_ymd(yy, mm)
m2 = MON_YYYY_RE.search(s)
if m2:
mon = _parse_mon(m2.group(1))
yy = int(m2.group(2))
if mon:
return _as_ymd(yy, mon)
m3 = YYYY_RE.search(s)
if m3:
yy = int(m3.group(1))
return _as_ymd(yy, 1)
return None
def _merge_intervals(intervals: List[Tuple[date, date]]) -> List[Tuple[date, date]]:
if not intervals:
return []
intervals = sorted(intervals, key=lambda x: (x[0], x[1]))
merged = [intervals[0]]
for s, e in intervals[1:]:
ls, le = merged[-1]
if s <= le:
merged[-1] = (ls, max(le, e))
else:
merged.append((s, e))
return merged
def _months_between(a: date, b: date) -> int:
# month-level difference (inclusive-ish): b >= a
return (b.year - a.year) * 12 + (b.month - a.month)
def extract_experience(text: str) -> ExpResult:
debug: Dict = {"direct_matches": [], "ranges": [], "intervals": []}
# 1) Direct years
directs = []
for m in DIRECT_YEARS_RE.finditer(text):
try:
v = float(m.group(1).replace(",", "."))
if 0 <= v <= 45:
directs.append(v)
debug["direct_matches"].append({"match": m.group(0), "value": v})
except Exception:
pass
if directs:
years = _clamp_years(max(directs))
return ExpResult(years=years, confidence=0.90, debug=debug)
# 2) Ranges in lines: try to detect "start - end"
intervals: List[Tuple[date, date]] = []
for line in text.splitlines():
ln = line.strip()
if len(ln) < 7:
continue
# require range separator
if not any(x in ln for x in ("—", "–", "-", " to ", " по ")):
continue
rr = RANGE_RE.match(ln)
if not rr:
continue
a = rr.group("a")
b = rr.group("b")
da = _parse_one_date(a)
db = _parse_one_date(b)
if da and db:
if db < da:
da, db = db, da
# cap extremely old
if da.year < 1990:
continue
intervals.append((da, db))
debug["ranges"].append({"line": ln, "start": da.isoformat(), "end": db.isoformat()})
intervals = _merge_intervals(intervals)
debug["intervals"] = [{"start": s.isoformat(), "end": e.isoformat()} for s, e in intervals]
if not intervals:
return ExpResult(years=None, confidence=0.0, debug=debug)
total_months = 0
for s, e in intervals:
total_months += max(0, _months_between(s, e))
years = round(total_months / 12.0, 2)
years = _clamp_years(years) if years is not None else None
# confidence depends on amount of evidence
conf = 0.70 if total_months >= 12 else 0.55
return ExpResult(years=years, confidence=conf, debug=debug)
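
Two sketches for extract_experience (not part of the commit): a direct "N years" statement, which short-circuits with high confidence, and explicit date ranges, which are merged and summed.

from tg_resume_db.extract.experience import extract_experience

direct = extract_experience("Backend developer, 7+ years of experience with Python and Go")
print(direct.years, direct.confidence)   # 7.0 0.9

ranged = extract_experience("Acme Corp\n03.2019 - 11.2021\nBeta LLC\n01.2022 - present")
print(ranged.years, ranged.confidence)   # total from merged, non-overlapping intervals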


@@ -0,0 +1,144 @@
from __future__ import annotations
import re
from dataclasses import dataclass, asdict
from datetime import date
from typing import List, Optional
MONTHS = {
"jan": 1, "january": 1, "янв": 1, "январ": 1,
"feb": 2, "february": 2, "фев": 2, "феврал": 2,
"mar": 3, "march": 3, "мар": 3, "март": 3,
"apr": 4, "april": 4, "апр": 4, "апрел": 4,
"may": 5, "май": 5,
"jun": 6, "june": 6, "июн": 6, "июнь": 6,
"jul": 7, "july": 7, "июл": 7, "июль": 7,
"aug": 8, "august": 8, "авг": 8, "август": 8,
"sep": 9, "september": 9, "сен": 9, "сент": 9,
"oct": 10, "october": 10, "окт": 10, "октя": 10,
"nov": 11, "november": 11, "ноя": 11, "ноябр": 11,
"dec": 12, "december": 12, "дек": 12, "дека": 12,
}
PRESENT_RE = re.compile(r"\b(present|now|current|настоящее время|по н\.|по настоящее)\b", re.I)
MMYYYY_RE = re.compile(r"\b(0?[1-9]|1[0-2])[./-](\d{4})\b")
YYYY_RE = re.compile(r"\b(19\d{2}|20\d{2})\b")
MON_YYYY_RE = re.compile(r"\b([A-Za-z]{3,9}|[А-Яа-я]{3,9})\.?\s*(\d{4})\b")
RANGE_RE = re.compile(r"(?P<a>.+?)\s*(?:—|–|-|to|по)\s*(?P<b>.+?)$", re.I)
YEAR_RANGE_ONLY_RE = re.compile(r"^\s*\d{4}\s*(?:—|–|-|to|по)\s*\d{4}\s*$", re.I)
EDU_CONTEXT_RE = re.compile(
r"\b("
r"education|university|institute|college|academy|school|bachelor|master|degree|faculty|"
r"образование|университет|институт|академ|колледж|школа|бакалав|магистр|факультет"
r")\b",
re.I,
)
@dataclass
class Position:
title: Optional[str]
company: Optional[str]
date_from: Optional[str]
date_to: Optional[str]
is_current: Optional[bool]
description: Optional[str]
def _parse_mon(mon: str) -> Optional[int]:
m = mon.strip().lower()
m = re.sub(r"[^\wа-я]+", "", m, flags=re.I)
for k, v in MONTHS.items():
if m.startswith(k):
return v
return None
def _as_ymd(y: int, m: int) -> date:
return date(y, m, 1)
def _parse_one_date(s: str) -> Optional[date]:
s = s.strip()
if PRESENT_RE.search(s):
today = date.today()
return date(today.year, today.month, 1)
m1 = MMYYYY_RE.search(s)
if m1:
mm = int(m1.group(1))
yy = int(m1.group(2))
return _as_ymd(yy, mm)
m2 = MON_YYYY_RE.search(s)
if m2:
mon = _parse_mon(m2.group(1))
yy = int(m2.group(2))
if mon:
return _as_ymd(yy, mon)
m3 = YYYY_RE.search(s)
if m3:
yy = int(m3.group(1))
return _as_ymd(yy, 1)
return None
def extract_positions(text: str, max_items: int = 40) -> List[Position]:
lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]
positions: List[Position] = []
i = 0
while i < len(lines) and len(positions) < max_items:
ln = lines[i]
if not any(x in ln for x in ("—", "–", "-", " to ", " по ")):
i += 1
continue
rr = RANGE_RE.match(ln)
if not rr:
i += 1
continue
ctx = " ".join(lines[max(0, i - 2): min(len(lines), i + 4)])
if YEAR_RANGE_ONLY_RE.match(ln) and EDU_CONTEXT_RE.search(ctx):
i += 1
continue
da = _parse_one_date(rr.group("a"))
db = _parse_one_date(rr.group("b"))
if not da or not db:
i += 1
continue
if da.year < 1990:
i += 1
continue
is_current = PRESENT_RE.search(rr.group("b")) is not None
title = None
company = None
desc_lines: List[str] = []
if i + 1 < len(lines):
if EDU_CONTEXT_RE.search(lines[i + 1]):
i += 1
continue
header = lines[i + 1]
parts = [p.strip() for p in re.split(r"[,|/]", header) if p.strip()]
if parts:
title = parts[0]
if len(parts) > 1:
company = parts[1]
j = i + 2
while j < len(lines):
if any(x in lines[j] for x in ("—", "–", "-", " to ", " по ")) and RANGE_RE.match(lines[j]):
break
desc_lines.append(lines[j])
j += 1
positions.append(
Position(
title=title,
company=company,
date_from=da.isoformat(),
date_to=db.isoformat(),
is_current=is_current,
description="\n".join(desc_lines).strip() if desc_lines else None,
)
)
i = j
return positions
def positions_to_dicts(items: List[Position]) -> List[dict]:
return [asdict(p) for p in items]
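
A sketch for the position extractor in this hunk (not part of the commit). The file header for this hunk is missing above, so the module path below is an assumption; adjust it to the actual file name.

from tg_resume_db.extract.positions import extract_positions, positions_to_dicts  # path assumed

text = """03.2021 - present
Senior Backend Engineer, Acme Corp
Built billing services in Go and Python
01.2018 - 02.2021
Backend Developer, Beta LLC"""
for p in positions_to_dicts(extract_positions(text)):
    print(p["title"], p["company"], p["date_from"], p["date_to"], p["is_current"])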

585
extract/llm.py Normal file

@@ -0,0 +1,585 @@
from __future__ import annotations
import hashlib
import json
import os
import re
import sqlite3
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
try:
import httpx # type: ignore
except Exception: # pragma: no cover
httpx = None # type: ignore
def resolve_llm_runtime() -> Dict[str, str]:
"""
Resolve OpenAI-compatible runtime config.
Supports both generic vars and Mistral aliases:
- generic: LLM_BASE_URL / LLM_MODEL / LLM_API_KEY
- mistral: MISTRAL_API_KEY / MISTRAL_MODEL / MISTRAL_BASE_URL
"""
provider = (os.environ.get("LLM_PROVIDER") or "").strip().lower()
base_url = (os.environ.get("LLM_BASE_URL") or "").strip()
model = (os.environ.get("LLM_MODEL") or "").strip()
api_key = (os.environ.get("LLM_API_KEY") or "").strip()
mistral_key = (os.environ.get("MISTRAL_API_KEY") or "").strip()
mistral_model = (os.environ.get("MISTRAL_MODEL") or "").strip()
mistral_base = (os.environ.get("MISTRAL_BASE_URL") or "https://api.mistral.ai/v1").strip()
if not api_key and mistral_key:
api_key = mistral_key
if not model and mistral_model:
model = mistral_model
if not base_url and (mistral_key or mistral_model or provider == "mistral" or os.environ.get("MISTRAL_BASE_URL")):
base_url = mistral_base
if base_url:
base_url = base_url.rstrip("/")
if not provider:
if "mistral.ai" in base_url or (model and model.lower().startswith("mistral")):
provider = "mistral"
else:
provider = "generic"
return {
"provider": provider,
"base_url": base_url,
"model": model,
"api_key": api_key,
}
# ------------- Public API -------------
def llm_parse_enabled() -> bool:
"""
Enabled only if httpx is available and both base_url/model are resolved.
Opt-out via LLM_PARSE_ENABLED=0.
"""
if httpx is None:
return False
if os.environ.get("LLM_PARSE_ENABLED", "1").lower() in ("0", "false", "no"):
return False
runtime = resolve_llm_runtime()
return bool(runtime["base_url"]) and bool(runtime["model"])
_PROMPT_VERSION = "v3_sections_doc_type"
_REVIEW_PROMPT_VERSION = "v1_review_merge"
@dataclass
class LLMExtraction:
roles: List[str]
skills: List[str]
primary_languages: List[str]
seniority: Optional[str]
backend_focus: Optional[bool]
experience_years_total: Optional[float]
experience_years_engineering: Optional[float]
english_level: Optional[str]
location: Optional[str]
remote_ok: Optional[bool]
salary_min_usd: Optional[int]
salary_max_usd: Optional[int]
salary_min_rub: Optional[int]
salary_max_rub: Optional[int]
highlights: List[str]
keywords: List[str]
@staticmethod
def from_obj(obj: Dict[str, Any]) -> "LLMExtraction":
def _as_list(v: Any) -> List[str]:
if v is None:
return []
if isinstance(v, list):
return [str(x).strip() for x in v if str(x).strip()]
s = str(v).strip()
return [s] if s else []
def _as_float(v: Any) -> Optional[float]:
try:
return float(v)
except Exception:
return None
def _as_int(v: Any) -> Optional[int]:
try:
return int(float(v))
except Exception:
return None
def _as_bool(v: Any) -> Optional[bool]:
if isinstance(v, bool):
return v
if v is None:
return None
s = str(v).strip().lower()
if s in ("true", "1", "yes", "y"):
return True
if s in ("false", "0", "no", "n"):
return False
return None
return LLMExtraction(
roles=_as_list(obj.get("roles")),
skills=_as_list(obj.get("skills")),
primary_languages=_as_list(obj.get("primary_languages")),
seniority=(str(obj.get("seniority")).strip().lower() or None) if obj.get("seniority") else None,
backend_focus=_as_bool(obj.get("backend_focus")),
experience_years_total=_as_float(obj.get("experience_years_total")),
experience_years_engineering=_as_float(obj.get("experience_years_engineering")),
english_level=(str(obj.get("english_level")).strip().upper() or None) if obj.get("english_level") else None,
location=(str(obj.get("location")).strip() or None) if obj.get("location") else None,
remote_ok=_as_bool(obj.get("remote_ok")),
salary_min_usd=_as_int(obj.get("salary_min_usd")),
salary_max_usd=_as_int(obj.get("salary_max_usd")),
salary_min_rub=_as_int(obj.get("salary_min_rub")),
salary_max_rub=_as_int(obj.get("salary_max_rub")),
highlights=_as_list(obj.get("highlights")),
keywords=_as_list(obj.get("keywords")),
)
def llm_extract_profile(
clean_text: str,
*,
con: Optional[sqlite3.Connection] = None,
doc_type: Optional[str] = None,
sections: Optional[Dict[str, str]] = None,
) -> Tuple[Optional[LLMExtraction], Dict[str, Any]]:
"""
Returns (LLMExtraction | None, debug_info).
- Uses cache on disk/sqlite to keep throughput high.
- Silently degrades to None on any failure.
"""
runtime = resolve_llm_runtime()
dbg: Dict[str, Any] = {
"enabled": llm_parse_enabled(),
"provider": runtime.get("provider"),
"model": runtime.get("model"),
"from_cache": False,
"cache_backend": None,
"error": None,
"prompt_version": _PROMPT_VERSION,
}
if not llm_parse_enabled():
return None, dbg
text_hash = hashlib.sha1(clean_text.encode("utf-8", errors="ignore")).hexdigest()
cache_key = f"extract:{text_hash}:{runtime['model']}:{_PROMPT_VERSION}"
payload = _build_payload(
clean_text,
doc_type=doc_type,
sections=sections,
prompt_version=_PROMPT_VERSION,
temperature=float(os.environ.get("LLM_PARSE_TEMPERATURE", 0.1)),
max_tokens=int(os.environ.get("LLM_PARSE_MAX_TOKENS", 700)),
system_prompt="You output ONLY JSON for structured resume extraction.",
prompt_template=_PROMPT_TEMPLATE,
)
data = _cached_llm_json_call(
con=con,
cache_key=cache_key,
model=runtime["model"],
payload=payload,
dbg=dbg,
)
if data is None:
return None, dbg
return LLMExtraction.from_obj(data), dbg
def llm_review_profile(
clean_text: str,
*,
draft: Dict[str, Any],
con: Optional[sqlite3.Connection] = None,
doc_type: Optional[str] = None,
sections: Optional[Dict[str, str]] = None,
) -> Tuple[Optional[LLMExtraction], Dict[str, Any]]:
"""
Second-pass validator:
- Takes already parsed JSON (draft)
- Re-checks every field against resume text
- Returns corrected extraction for safe merge in pipeline
"""
runtime = resolve_llm_runtime()
dbg: Dict[str, Any] = {
"enabled": llm_parse_enabled(),
"provider": runtime.get("provider"),
"model": runtime.get("model"),
"from_cache": False,
"cache_backend": None,
"error": None,
"prompt_version": _REVIEW_PROMPT_VERSION,
"quality_score": None,
"changed_fields": [],
"issues_found": [],
}
if not llm_parse_enabled():
return None, dbg
clean_draft = _sanitize_review_draft(draft)
draft_blob = json.dumps(clean_draft, ensure_ascii=False, sort_keys=True)
text_hash = hashlib.sha1(clean_text.encode("utf-8", errors="ignore")).hexdigest()
draft_hash = hashlib.sha1(draft_blob.encode("utf-8", errors="ignore")).hexdigest()
cache_key = f"review:{text_hash}:{draft_hash}:{runtime['model']}:{_REVIEW_PROMPT_VERSION}"
payload = _build_payload(
clean_text,
doc_type=doc_type,
sections=sections,
prompt_version=_REVIEW_PROMPT_VERSION,
temperature=float(os.environ.get("LLM_REVIEW_TEMPERATURE", 0.0)),
max_tokens=int(os.environ.get("LLM_REVIEW_MAX_TOKENS", 850)),
system_prompt="You output ONLY JSON for resume parsing quality review.",
prompt_template=_REVIEW_PROMPT_TEMPLATE,
extra_vars={"draft_json": draft_blob},
)
data = _cached_llm_json_call(
con=con,
cache_key=cache_key,
model=runtime["model"],
payload=payload,
dbg=dbg,
)
if data is None:
return None, dbg
corrected_obj: Dict[str, Any]
if isinstance(data.get("corrected"), dict):
corrected_obj = data["corrected"]
else:
corrected_obj = data
dbg["quality_score"] = _as_float(data.get("quality_score"))
dbg["changed_fields"] = _as_str_list(data.get("changed_fields"))
dbg["issues_found"] = _as_str_list(data.get("issues_found"))
return LLMExtraction.from_obj(corrected_obj), dbg
# ------------- Internal helpers -------------
_PROMPT_TEMPLATE = """
You are an assistant that structures developer resumes. Respond with JSON ONLY.
Use only facts from the text and do not invent anything. If data is missing, use null or an empty list.
Schema:
{{
"roles": ["backend","devops","frontend","qa","data engineer","android","ios"],
"skills": ["python","go","k8s","postgres","react", "..."],
"primary_languages": ["python","go","java","c++", "..."],
"seniority": "intern|junior|middle|senior|lead|principal|null",
"backend_focus": true|false|null,
"experience_years_total": number|null,
"experience_years_engineering": number|null,
"english_level": "A1|A2|B1|B2|C1|C2|null",
"location": "city, country|null",
"remote_ok": true|false|null,
"salary_min_usd": int|null,
"salary_max_usd": int|null,
"salary_min_rub": int|null,
"salary_max_rub": int|null,
"highlights": ["кратко достижения (1-2 предложения)"],
"keywords": ["уникальные ключевые слова, продукты или домены"]
}}
Do not include contact details in skills/keywords.
Detected doc_type: {doc_type}
Sections (if present):
{sections_block}
Full text snippet (use only if needed):
```TEXT
{resume_text}
```
"""
_REVIEW_PROMPT_TEMPLATE = """
You are a quality validator for developer resume parsing. Respond with JSON ONLY.
You are given a draft JSON produced by heuristics / first-pass parsing. Re-check every field against the resume text.
Correct only what is directly supported by the text. Do not invent anything.
Return JSON of exactly this shape:
{{
"corrected": {{
"roles": ["..."],
"skills": ["..."],
"primary_languages": ["..."],
"seniority": "intern|junior|middle|senior|lead|principal|null",
"backend_focus": true|false|null,
"experience_years_total": number|null,
"experience_years_engineering": number|null,
"english_level": "A1|A2|B1|B2|C1|C2|null",
"location": "city, country|null",
"remote_ok": true|false|null,
"salary_min_usd": int|null,
"salary_max_usd": int|null,
"salary_min_rub": int|null,
"salary_max_rub": int|null,
"highlights": ["..."],
"keywords": ["..."]
}},
"changed_fields": ["field_name", "..."],
"issues_found": ["кратко что было неверно/сомнительно", "..."],
"quality_score": 0.0
}}
Draft JSON:
```DRAFT
{draft_json}
```
Detected doc_type: {doc_type}
Sections (if present):
{sections_block}
Full text snippet (use only if needed):
```TEXT
{resume_text}
```
"""
def _trim_text(text: str, max_len: int = 9000) -> str:
"""
Keep head and tail to preserve summary + recent projects.
"""
if len(text) <= max_len:
return text
head = text[: max_len // 2]
tail = text[-max_len // 2 :]
return head + "\n...\n" + tail
def _build_payload(
clean_text: str,
*,
doc_type: Optional[str],
sections: Optional[Dict[str, str]],
prompt_version: str,
temperature: float,
max_tokens: int,
system_prompt: str,
prompt_template: str,
extra_vars: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
runtime = resolve_llm_runtime()
base_url = runtime["base_url"]
model = runtime["model"]
sections_block = _build_sections_block(sections)
tpl_vars = {
"resume_text": _trim_text(clean_text),
"doc_type": (doc_type or "unknown"),
"sections_block": sections_block or "(no sections detected)",
}
if extra_vars:
tpl_vars.update(extra_vars)
prompt = prompt_template.format(**tpl_vars)
return {
"base_url": base_url,
"model": model,
"prompt_version": prompt_version,
"payload": {
"model": model,
"messages": [
{"role": "system", "content": system_prompt},
{"role": "user", "content": prompt},
],
"temperature": temperature,
"max_tokens": max_tokens,
},
"headers": _build_headers(runtime),
"timeout": float(os.environ.get("LLM_PARSE_TIMEOUT", 18.0)),
}
def _build_headers(runtime: Dict[str, str]) -> Dict[str, str]:
headers = {"Content-Type": "application/json"}
api_key = runtime.get("api_key", "")
if api_key:
headers["Authorization"] = f"Bearer {api_key}"
return headers
def _cached_llm_json_call(
*,
con: Optional[sqlite3.Connection],
cache_key: str,
model: str,
payload: Dict[str, Any],
dbg: Dict[str, Any],
) -> Optional[Dict[str, Any]]:
data = _cache_get_sqlite(con, cache_key)
if data:
dbg["from_cache"] = True
dbg["cache_backend"] = "sqlite"
return data
cache_dir = Path(os.environ.get("LLM_PARSE_CACHE", ".cache/llm_parse")).resolve()
cache_ok = True
try:
cache_dir.mkdir(parents=True, exist_ok=True)
except Exception:
cache_ok = False
safe_name = cache_key.replace(":", "_")
cache_path = (cache_dir / f"{safe_name}.json") if cache_ok else None
if cache_path and cache_path.exists():
try:
data = json.loads(cache_path.read_text(encoding="utf-8"))
dbg["from_cache"] = True
dbg["cache_backend"] = "disk"
return data
except Exception:
pass
try:
data = _llm_call_json(payload)
if con:
_cache_put_sqlite(con, cache_key, model, data)
if cache_path:
cache_path.write_text(json.dumps(data, ensure_ascii=False), encoding="utf-8")
return data
except Exception as e: # pragma: no cover - network/LLM failures
dbg["error"] = repr(e)
return None
def _llm_call_json(task: Dict[str, Any]) -> Dict[str, Any]:
if httpx is None:
raise RuntimeError("httpx is not installed")
base_url: str = task["base_url"]
payload: Dict[str, Any] = task["payload"]
timeout = float(task.get("timeout", 18.0))
with httpx.Client(timeout=timeout) as client:
r = client.post(f"{base_url}/chat/completions", headers=task["headers"], json=payload)
r.raise_for_status()
data = r.json()
content = data["choices"][0]["message"]["content"]
if isinstance(content, list):
parts = []
for block in content:
if isinstance(block, dict):
parts.append(str(block.get("text") or ""))
else:
parts.append(str(block))
content = "\n".join(parts)
content = str(content)
m = re.search(r"\{.*\}", content, flags=re.S)
if not m:
raise ValueError("LLM did not return JSON")
return json.loads(m.group(0))
def _build_sections_block(sections: Optional[Dict[str, str]]) -> str:
if not sections:
return ""
parts: List[str] = []
order = [
("about", "ABOUT"),
("skills", "SKILLS"),
("experience", "EXPERIENCE"),
("education", "EDUCATION"),
("contacts", "CONTACTS"),
]
for key, label in order:
text = sections.get(key)
if not text:
continue
snippet = _trim_text(text, max_len=1800)
parts.append(f"[{label}]\n{snippet}")
return "\n\n".join(parts)
def _sanitize_review_draft(draft: Dict[str, Any]) -> Dict[str, Any]:
if not isinstance(draft, dict):
draft = {}
allowed = {
"roles",
"skills",
"primary_languages",
"seniority",
"backend_focus",
"experience_years_total",
"experience_years_engineering",
"english_level",
"location",
"remote_ok",
"salary_min_usd",
"salary_max_usd",
"salary_min_rub",
"salary_max_rub",
"highlights",
"keywords",
}
cleaned = {k: v for k, v in draft.items() if k in allowed}
return asdict(LLMExtraction.from_obj(cleaned))
def _as_float(v: Any) -> Optional[float]:
try:
x = float(v)
except Exception:
return None
if x < 0:
return None
if x > 1.0:
return 1.0
return x
def _as_str_list(v: Any) -> List[str]:
if v is None:
return []
if isinstance(v, list):
return [str(x).strip() for x in v if str(x).strip()]
s = str(v).strip()
return [s] if s else []
def _cache_get_sqlite(con: Optional[sqlite3.Connection], cache_key: str) -> Optional[Dict[str, Any]]:
if con is None:
return None
try:
row = con.execute("SELECT result_json FROM llm_cache WHERE cache_key=?", (cache_key,)).fetchone()
if row and row["result_json"]:
return json.loads(row["result_json"])
except Exception:
return None
return None
def _cache_put_sqlite(
con: Optional[sqlite3.Connection],
cache_key: str,
model: str,
data: Dict[str, Any],
) -> None:
if con is None:
return
try:
con.execute(
"INSERT OR REPLACE INTO llm_cache(cache_key, model, result_json) VALUES (?,?,?)",
(cache_key, model, json.dumps(data, ensure_ascii=False)),
)
except Exception:
return
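
A sketch of the runtime resolution and graceful degradation described above (not part of the commit). The env values are placeholders; with a fake key the extraction call fails over the network and returns (None, dbg) by design.

import os
from tg_resume_db.extract.llm import resolve_llm_runtime, llm_parse_enabled, llm_extract_profile

os.environ.setdefault("MISTRAL_API_KEY", "sk-placeholder")        # not a real key
os.environ.setdefault("MISTRAL_MODEL", "mistral-small-latest")    # assumed model name

print(resolve_llm_runtime())   # provider/base_url/model/api_key resolved from the env aliases
print(llm_parse_enabled())     # False if httpx is missing or base_url/model are unresolved

profile, dbg = llm_extract_profile("Senior Python developer, 6 years, Berlin, Germany")
print(dbg["enabled"], dbg["from_cache"], dbg["error"])
if profile is not None:
    print(profile.roles, profile.skills, profile.location)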

659
extract/parse.py Normal file

@@ -0,0 +1,659 @@
from __future__ import annotations
import json
import re
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
from tg_resume_db.normalize import normalize_skill
from tg_resume_db.extract.experience import extract_experience
EMAIL_RE = re.compile(r"\b[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}\b", re.I)
EMAIL_SPLIT_RE = re.compile(
r"(?<![@\w])(?P<prefix>[a-z0-9][a-z0-9._%+\-]{1,40})\s+"
r"(?P<tail>[a-z0-9][a-z0-9._%+\-]{0,40}@[a-z0-9.\-]+\.[a-z]{2,})",
re.I,
)
PHONE_RE = re.compile(r"(?<!\w)(\+?\d[\d\-\s().]{7,}\d)(?!\w)")
TG_RE = re.compile(r"(?:t\.me/|@)([a-z0-9_]{4,32})", re.I)
GITHUB_RE = re.compile(r"github\.com/([A-Za-z0-9\-]+)", re.I)
LINKEDIN_RE = re.compile(r"linkedin\.com/in/([A-Za-z0-9\-_]+)", re.I)
URL_RE = re.compile(r"\bhttps?://[^\s)]+", re.I)
EN_RE = re.compile(r"\b(A1\+?|A2\+?|B1\+?|B2\+?|C1\+?|C2\+?)\b", re.I)
EN_TEXT_RE = re.compile(
r"\b(native|fluent|proficient|advanced|upper\s*intermediate|intermediate|elementary)\b",
re.I,
)
EN_LANG_RE = re.compile(r"\b(english|англий)\b", re.I)
REMOTE_RE = re.compile(
r"\b("
r"full[\s\-]?remote|remote[\s\-]?(work|position|job|only)|open to remote|remote first|"
r"удален\w*|удалён\w*|дистанцион\w*|home office|relocation not needed"
r")\b",
re.I,
)
# Salary (rough)
CURRENCY_RE = re.compile(r"(?:\b(?:руб|rub|usd|eur)\b|[₽$€])", re.I)
NUM_RE = re.compile(r"(?:(\d{2,3})\s*(k|к)\b)|(\d{2,3}\s*000)|(\d{4,7})", re.I)
SALARY_HINT_RE = re.compile(
r"\b("
r"salary|compensation|rate|expected salary|desired salary|salary expectation|income|"
r"зарплат\w*|доход|оклад|вознагражден\w*|заработ\w*|expectations"
r")\b",
re.I,
)
PAY_TOKEN_RE = re.compile(
r"([€$₽]|\b(?:usd|eur|rub|руб)\b).{0,14}\d|\d.{0,14}([€$₽]|\b(?:usd|eur|rub|руб)\b)",
re.I,
)
SALARY_NOISE_RE = re.compile(
r"\b(users?|employees?|people|domains?|cities?|objects?|stores?|requests?|transactions?|"
r"companies?|followers?|downloads?|clients?)\b",
re.I,
)
SECTION_HEADER_RE = re.compile(
r"^\s*(contacts?|contact info|about|summary|skills?|experience|work experience|education|languages?|projects?)\s*$",
re.I,
)
LOCATION_CITY_COUNTRY_RE = re.compile(
r"^[A-Za-zА-Яа-я][A-Za-zА-Яа-я' .\-]{1,40},\s*[A-Za-zА-Яа-я][A-Za-zА-Яа-я' .\-]{1,40}$"
)
# --- SKILLS & ROLES ---
SKILLS = {
"python","go","golang","java","kotlin","c#","c++","cpp","javascript","typescript","node","nodejs","react","vue","angular",
"sql","postgres","postgresql","mysql","mssql","redis","kafka","rabbitmq","docker","k8s","kubernetes","helm","terraform",
"aws","gcp","azure","linux","nginx","grpc","rest","graphql","spark","airflow","fastapi","django","flask","spring","dotnet",
"pytest","selenium","playwright","ci/cd","gitlab","github actions","prometheus","grafana"
}
_SKILL_ALIASES: Dict[str, List[str]] = {
"javascript": ["java script", "java-script", "js"],
"typescript": ["type script", "type-script", "ts"],
"postgresql": ["postgres", "postgre sql", "postgre-sql"],
"graphql": ["graph ql"],
"grpc": ["g rpc"],
}
def _build_skill_patterns() -> List[Tuple[str, re.Pattern]]:
patterns: List[Tuple[str, re.Pattern]] = []
for skill in sorted(SKILLS):
aliases = [skill] + _SKILL_ALIASES.get(skill, [])
for alias in aliases:
if skill == "java" and alias == "java":
# Do not match "java" inside "java script".
pat = re.compile(r"(?<![a-z0-9+#])java(?!\s*script)(?![a-z0-9+#])", re.I)
else:
pat = re.compile(r"(?<![a-z0-9+#])" + re.escape(alias) + r"(?![a-z0-9+#])", re.I)
patterns.append((skill, pat))
return patterns
_SKILL_PATTERNS = _build_skill_patterns()
ROLES = {
"backend","frontend","fullstack","devops","qa","sre","data engineer","data scientist","ml engineer",
"mobile","android","ios","team lead","tech lead","architect"
}
_ROLE_ALIASES: Dict[str, List[str]] = {
"backend": ["backend", "backend developer", "backend engineer", "бэкенд", "бекенд"],
"frontend": ["frontend", "frontend developer", "frontend engineer", "фронтенд", "фронт"],
"fullstack": ["fullstack", "full stack", "full-stack", "фулстек"],
"devops": ["devops", "dev ops", "platform engineer", "infrastructure engineer"],
"qa": ["qa", "quality assurance", "tester", "test engineer", "test automation", "manual qa"],
"sre": ["sre", "site reliability"],
"data engineer": ["data engineer"],
"data scientist": ["data scientist"],
"ml engineer": ["ml engineer", "machine learning engineer"],
"mobile": ["mobile developer", "mobile engineer"],
"android": ["android developer", "android engineer"],
"ios": ["ios developer", "ios engineer"],
"team lead": ["team lead", "teamlead"],
"tech lead": ["tech lead", "techlead"],
"architect": ["architect", "solution architect", "software architect"],
}
def _build_role_patterns() -> Dict[str, List[re.Pattern]]:
out: Dict[str, List[re.Pattern]] = {}
for role in ROLES:
aliases = _ROLE_ALIASES.get(role, [role])
out[role] = [
re.compile(r"(?<![a-z0-9+#])" + re.escape(a) + r"(?![a-z0-9+#])", re.I)
for a in aliases
]
return out
_ROLE_PATTERNS = _build_role_patterns()
# --- HR / RECRUITER FILTERS ---
# Words that indicate the line is about searching for candidates, not owning the skill.
HR_CONTEXT_RE = re.compile(
r"\b(hiring|recruitment|recruiter|sourc(ing|er)|talent|acquisition|vacancy|vacancies|candidate|staffing|headhunt)\b|"
r"\b(подбор|поиск|найм|закры(ла|л|тие)|ваканси|резюме|сорс(инг|ер)|рекрут|персонал|кадр(ы|ов)|hr)\b",
re.I
)
# Roles that explicitly define the person as Non-Engineering
NON_TECH_ROLES_RE = re.compile(
r"\b(recruiter|hr|talent|manager|generalist|human resources|head of recruitment|рекрутер|менеджер по персоналу|эйчар)\b",
re.I
)
# --- EXPERIENCE ---
AGE_LINE_RE = re.compile(
r"(?i)\b(мужчина|женщина|родил[а-я]*|возраст|years?\s+old)\b"
)
EXP_HEADER_RE = re.compile(
r"(?i)\b(опыт\s+работы|стаж(\s+работы)?|work\s+experience|experience)\b"
)
# "5 years 10 months"
EXP_SUMMARY_RE = re.compile(
r"(?i)\b(опыт\s+работы|стаж(\s+работы)?|work\s+experience|experience)\b"
r"[^0-9]{0,20}"
r"(?P<y>\d{1,2})\s*(?:год|года|лет|years?|yrs?)"
r"(?:[^0-9]{0,20}(?P<m>\d{1,2})\s*(?:мес|месяц|месяца|месяцев|months?))?"
)
EXP_NEARBY_RE = re.compile(
r"(?i)\b(?P<y>\d{1,2})\s*(?:год|года|лет|years?|yrs?)"
r"(?:[^0-9]{0,20}(?P<m>\d{1,2})\s*(?:мес|месяц|месяца|месяцев|months?))?"
)
HH_FOOTER_RE = re.compile(
r"(?P<name>[A-Za-zА-ЯЁ][A-Za-zА-Яа-яЁё'\-\s]{2,80})\s*[•|]\s*резюме\s+обновлено",
re.I,
)
NAME_KV_RE = re.compile(r"^\s*(name|имя)\s*[:\-]\s*(.+)$", re.I)
NAME_LINE_RE = re.compile(
r"^[A-ZА-ЯЁ][A-Za-zА-Яа-яЁё'\-]+(?:\s+[A-ZА-ЯЁ][A-Za-zА-Яа-яЁё'\-]+){1,3}$"
)
NAME_STOPWORDS = {
"resume", "cv", "contacts", "contact", "summary", "skills", "experience", "education",
"projects", "about", "profile", "objective", "навыки", "опыт", "образование",
"контакты", "профиль", "цель", "резюме",
"developer", "engineer", "backend", "frontend", "fullstack", "team lead", "tech lead",
"backend developer", "frontend developer", "fullstack developer", "software engineer",
"разработчик", "инженер", "бэкенд", "фронтенд", "фулстек", "тимлид", "техлид",
"top skills", "experience", "education", "languages", "certifications",
"skills & endorsements", "endorsements",
"university", "state university", "institute", "college", "academy", "school",
"bachelor", "master", "degree", "faculty", "университет", "институт", "академия",
"колледж", "школа", "бакалавр", "магистр", "факультет",
}
_NAME_BAD_WORDS = {
"skills", "top skills", "experience", "education", "languages", "certifications",
"projects", "summary", "about", "profile", "endorsements",
"university", "institute", "college", "academy", "school",
"bachelor", "master", "degree", "faculty",
}
NAME_INSTITUTION_RE = re.compile(
r"\b("
r"university|institute|college|academy|school|faculty|bachelor|master|degree|"
r"mathematics|computer science|informatics|physics|economics|management|"
r"университет|институт|академ|колледж|школа|факультет|бакалав|магистр|"
r"математик|информатик|физик|экономик|менеджмент"
r")\b",
re.I,
)
_EMAIL_PREFIX_STOP = {
"email", "mail", "contact", "contacts", "phone", "tel", "telegram", "linkedin", "github",
}
def _prune_fragment_emails(values: List[str]) -> List[str]:
uniq = sorted(set(v.lower().strip() for v in values if v and "@" in v))
out: List[str] = []
for e in uniq:
local, domain = e.split("@", 1)
drop = False
for other in uniq:
if other == e:
continue
ol, od = other.split("@", 1)
if od != domain:
continue
if len(local) <= 8 and len(ol) > len(local) + 2 and ol.endswith(local) and re.search(r"[._\-]", ol):
drop = True
break
if not drop:
out.append(e)
return out
def extract_experience_years(text: str) -> Tuple[Optional[float], Optional[float], float, Dict[str, Any]]:
"""
Returns (total_years, engineering_years, confidence, debug).
Logic:
1. Calculate TOTAL experience from summaries.
2. Check if the candidate is primarily a Recruiter/HR.
- If YES: engineering_years = 0.0 (prevents recruiters from showing up as Senior Devs).
- If NO: engineering_years = total_years (Optimistic assumption for valid devs).
"""
dbg: Dict[str, Any] = {"method": None, "matched": None, "is_recruiter": False}
total_years: Optional[float] = None
confidence = 0.0
lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]
# 1. Detect if Recruiter
    # Check the header (first ~15 lines) for HR/recruiter titles
header_text = "\n".join(lines[:15])
is_recruiter = bool(NON_TECH_ROLES_RE.search(header_text))
dbg["is_recruiter"] = is_recruiter
# 2. Extract Total Duration
if lines:
# Strategy A: Explicit summary
for i, ln in enumerate(lines[:200]):
if AGE_LINE_RE.search(ln): continue
# Look for summary line
if EXP_HEADER_RE.search(ln):
window = ln
if i + 1 < len(lines): window += " " + lines[i+1]
if i + 2 < len(lines): window += " " + lines[i+2]
m = EXP_SUMMARY_RE.search(window)
if m:
                    y = int(m.group("y"))
                    mm = int(m.group("m")) if m.group("m") else 0
                    val = float(round(y + (mm / 12.0), 2))
                    if 0 <= val <= 60:
                        total_years = val
                        dbg["method"] = "summary"
                        dbg["matched"] = m.group(0)
                        confidence = 0.95
                        break
# Strategy B: Fallback nearby
if total_years is None:
safe_lines = [ln for ln in lines if not AGE_LINE_RE.search(ln)]
for i, ln in enumerate(safe_lines):
if not EXP_HEADER_RE.search(ln): continue
chunk = " ".join(safe_lines[i : i + 12])
m = EXP_NEARBY_RE.search(chunk)
if m:
y = int(m.group("y"))
mm = int(m.group("m")) if m.group("m") else 0
val = float(round(y + (mm / 12.0), 2))
if 0 <= val <= 60:
total_years = val
dbg["method"] = "header_chunk"
dbg["matched"] = m.group(0)
confidence = 0.80
break
# 2.5 Timeline/range fallback-reconciliation
    # Protects against cases where the summary parser catches one short fragment
    # while the CV has a long timeline.
try:
alt = extract_experience(text or "")
except Exception:
alt = None
if alt and alt.years is not None:
if total_years is None:
total_years = alt.years
confidence = max(confidence, alt.confidence)
dbg["method"] = "timeline_fallback"
dbg["matched"] = "date_ranges"
elif alt.years > (total_years + 1.0):
strong_summary = str(dbg.get("method") or "") in ("summary", "header_chunk") and confidence >= 0.78
if strong_summary and (alt.years - float(total_years)) > 1.5:
dbg["reconcile"] = "timeline_skip_strong_summary"
else:
total_years = alt.years
confidence = max(confidence, min(0.82, alt.confidence))
dbg["method"] = "timeline_reconcile"
dbg["matched"] = "date_ranges"
# 3. Calculate Engineering Years
eng_years = total_years
if is_recruiter:
# If they are a recruiter, their "engineering" experience is effectively 0
# for the purpose of finding a Developer.
eng_years = 0.0
return total_years, eng_years, confidence, dbg
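# Example (illustrative, values approximate): a summary line such as
# "Опыт работы 5 лет 10 месяцев" is expected to be caught by Strategy A
# ("summary" method) and converted to roughly 5.83 total years; a recruiter
# title in the header would zero out the engineering years while keeping the total.
def _experience_years_example() -> Tuple[Optional[float], Optional[float], float, Dict[str, Any]]:
    return extract_experience_years("Backend developer\nОпыт работы 5 лет 10 месяцев\nPython, PostgreSQL")
    # expected: total around 5.83 via the "summary" strategy, engineering == total
    # (no recruiter markers), confidence around 0.95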
def _norm_phone(p: str) -> str:
digits = re.sub(r"\D+", "", p)
if digits.startswith("8") and len(digits) == 11:
digits = "7" + digits[1:]
return "+" + digits if digits else ""
def _norm_token(s: str) -> str:
return re.sub(r"\s+", " ", s.strip().lower())
def safe_json(v) -> str:
return json.dumps(v, ensure_ascii=False)
def extract_contacts(text: str) -> Dict[str, List[str]]:
    text = text or ""
    emails_set = set(m.group(0).lower() for m in EMAIL_RE.finditer(text))
for m in EMAIL_SPLIT_RE.finditer(text or ""):
prefix = m.group("prefix").strip().lower().strip(".-_")
if not prefix or prefix in _EMAIL_PREFIX_STOP:
continue
if not re.search(r"[._\-\d]", prefix):
continue
tail = m.group("tail").lower()
if "@" not in tail:
continue
local_tail, domain = tail.split("@", 1)
local = f"{prefix}{local_tail}"
if len(local) > 64:
continue
cand = f"{local}@{domain}"
if EMAIL_RE.fullmatch(cand):
emails_set.add(cand)
emails = _prune_fragment_emails(sorted(emails_set))
phones = sorted(set(_norm_phone(m.group(1)) for m in PHONE_RE.finditer(text) if _norm_phone(m.group(1))))
tg = sorted(set(m.group(1).lower() for m in TG_RE.finditer(text)))
gh = sorted(set(m.group(1).lower() for m in GITHUB_RE.finditer(text)))
li = sorted(set(m.group(1).lower() for m in LINKEDIN_RE.finditer(text)))
urls = sorted(set(m.group(0) for m in URL_RE.finditer(text)))
return {"emails": emails, "phones": phones, "telegram": tg, "github": gh, "linkedin": li, "urls": urls}
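# Example (illustrative): typical contact extraction for a small header block. The
# EMAIL_SPLIT_RE repair also rejoins emails whose local part was broken by a stray
# space (e.g. "ivan.p etrov@example.com"), as long as the prefix looks like a real local part.
def _contacts_example() -> Dict[str, List[str]]:
    text = (
        "Ivan Petrov\n"
        "ivan.petrov@example.com | +7 999 123-45-67\n"
        "t.me/ivan_petrov | github.com/ivanpetrov\n"
    )
    return extract_contacts(text)
    # expected: emails ["ivan.petrov@example.com"], phones ["+79991234567"],
    # github ["ivanpetrov"]; note the loose TG_RE may also pick up the email's
    # "@example" fragment as a telegram handle.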
def extract_name_guess(text: str) -> Optional[str]:
lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
if not lines:
return None
# 1) HH footer "Name • Резюме обновлено ..."
m = HH_FOOTER_RE.search(text or "")
if m:
cand = m.group("name").strip()
if _looks_like_name_line(cand):
return cand
# 2) Key-value line: "Name: ..." / "Имя: ..."
for ln in lines[:40]:
m2 = NAME_KV_RE.match(ln)
if m2:
cand = m2.group(2).strip()
cand = re.split(r"[|,/;]", cand)[0].strip()
if _looks_like_name_line(cand):
return cand
# 3) Name-like in first ~40 lines
for ln in lines[:40]:
if _looks_like_heading_line(ln):
continue
if _looks_like_name_line(ln):
return ln
# 4) Name-like near the end (pptx exports often put name there)
tail_start = max(0, len(lines) - 60)
for i in range(tail_start, len(lines)):
ln = lines[i]
if _looks_like_heading_line(ln):
continue
ctx = " ".join(lines[max(0, i - 2) : min(len(lines), i + 3)]).lower()
if NAME_INSTITUTION_RE.search(ctx):
continue
if _looks_like_name_line(ln):
return ln
return None
def _looks_like_heading_line(line: str) -> bool:
low = (line or "").strip().lower()
if not low:
return False
if low in _NAME_BAD_WORDS:
return True
if low.startswith("top skills"):
return True
if len(low.split()) <= 3 and any(w in low for w in ("skills", "experience", "education", "languages")):
return True
return False
def _looks_like_name_line(line: str) -> bool:
if not line:
return False
if len(line) > 80:
return False
low = line.lower().strip()
if low in NAME_STOPWORDS:
return False
if _looks_like_heading_line(line):
return False
if re.search(r"\b(resume|cv|резюме)\b", line, re.I):
return False
if NAME_INSTITUTION_RE.search(line):
return False
if not NAME_LINE_RE.match(line.strip()):
return False
return True
def extract_remote(text: str) -> Optional[bool]:
if not text:
return None
for ln in text.splitlines()[:120]:
if REMOTE_RE.search(ln):
return True
return None
def extract_english(text: str) -> Optional[str]:
t = text or ""
lines = [ln.strip() for ln in t.splitlines() if ln.strip()]
# 1) CEFR levels anywhere are accepted.
m = EN_RE.search(t)
if m:
return m.group(1).replace("+", "").upper()
# 2) Textual levels only when English context is present.
candidate_chunks: List[str] = []
for i, ln in enumerate(lines):
if EN_LANG_RE.search(ln):
candidate_chunks.append(ln)
if i + 1 < len(lines):
candidate_chunks.append(lines[i + 1])
if not candidate_chunks:
return None
m2 = EN_TEXT_RE.search("\n".join(candidate_chunks))
if not m2:
return None
word = m2.group(1).lower()
if word in ("native", "fluent", "proficient", "advanced"):
return "C1"
if word.startswith("upper"):
return "B2"
if word == "intermediate":
return "B1"
if word == "elementary":
return "A2"
return None
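# Example (illustrative): CEFR codes win anywhere in the text; textual levels are only
# trusted when an English-language context line is nearby.
def _english_example() -> Tuple[Optional[str], Optional[str]]:
    a = extract_english("Languages\nEnglish: B2 (Upper-Intermediate)")  # expected "B2" (CEFR hit)
    b = extract_english("Languages\nEnglish - fluent")  # expected "C1" via the textual fallback
    return a, b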
def extract_roles_skills(text: str) -> Tuple[List[str], List[str]]:
"""
Extracts roles and skills, but strictly filters out HR/Recruitment context.
"""
lines = text.splitlines()
# 1. Filter text: Remove lines that talk about hiring/vacancies
clean_lines = []
for ln in lines:
if not HR_CONTEXT_RE.search(ln):
clean_lines.append(ln)
clean_text = "\n".join(clean_lines).lower()
# 2. Extract Skills from clean text only
skills = []
for s, pat in _SKILL_PATTERNS:
if pat.search(clean_text):
skills.append(normalize_skill(s) or s)
skills = sorted(set(skills))
# 3. Extract Roles
# Priority: Header (first 10 lines)
header_text = "\n".join(lines[:10]).lower()
found_roles = set()
# Check if Recruiter
if NON_TECH_ROLES_RE.search(header_text):
# If explicit recruiter in header, do NOT add generic tech roles like "backend"
# even if they appear in the text (often describes who they hire).
pass
else:
# Normal extraction
for r in ROLES:
pats = _ROLE_PATTERNS.get(r, [])
if any(p.search(clean_text) for p in pats):
# extra guard: devops requires explicit evidence, not just CI/CD mentions
if r == "devops":
if not re.search(r"\b(devops|dev ops|sre|platform engineer|infrastructure)\b", clean_text, re.I):
continue
found_roles.add(r)
return sorted(list(found_roles)), skills
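# Example (illustrative): lines with hiring/recruiting context are dropped before matching,
# so a recruiter's "Looking for Python/Go engineers" does not credit them with those skills,
# while a developer's own stack still comes through.
def _roles_skills_example() -> Tuple[List[str], List[str]]:
    dev_text = "Backend developer\nStack: Python, FastAPI, PostgreSQL, Docker"
    roles, skills = extract_roles_skills(dev_text)
    # expected: roles include "backend"; skills include "python", "fastapi",
    # "postgresql", "docker" (after normalize_skill)
    return roles, skills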
def norm_pipe(tokens: List[str]) -> str:
toks = [_norm_token(t) for t in tokens if _norm_token(t)]
uniq = sorted(set(toks))
return "|" + "|".join(uniq) + "|" if uniq else "|"
def extract_salary(text: str) -> Tuple[Optional[int], Optional[int], float, Dict]:
dbg: Dict[str, Any] = {"numbers": [], "currency_hits": 0, "hint_lines": 0, "used_lines": []}
lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]
if not lines:
return None, None, 0.0, dbg
candidates: List[Tuple[int, str, bool, bool]] = []
for i, ln in enumerate(lines):
has_hint = SALARY_HINT_RE.search(ln) is not None
has_pay = PAY_TOKEN_RE.search(ln) is not None
if not has_hint and not has_pay:
continue
if SALARY_NOISE_RE.search(ln) and not has_hint:
continue
candidates.append((i, ln, has_hint, has_pay))
if not candidates:
return None, None, 0.0, dbg
has_hint = any(x[2] for x in candidates)
if not has_hint:
# Inline pay without "salary" is allowed only near header/contact block.
candidates = [x for x in candidates if x[0] < 15]
if not candidates:
return None, None, 0.0, dbg
scan_chunks: List[str] = []
for i, ln, hint, _ in candidates:
chunk = ln
if hint and (i + 1) < len(lines):
chunk = f"{chunk} {lines[i + 1]}"
scan_chunks.append(chunk)
dbg["used_lines"].append(ln)
if hint:
dbg["hint_lines"] += 1
dbg["currency_hits"] += len(CURRENCY_RE.findall(chunk))
nums: List[int] = []
for chunk in scan_chunks:
for m in NUM_RE.finditer(chunk):
val = None
if m.group(1) and m.group(2):
val = int(m.group(1)) * 1000
elif m.group(3):
val = int(re.sub(r"\s+", "", m.group(3)))
elif m.group(4):
val = int(m.group(4))
if val and 20_000 <= val <= 30_000_000:
nums.append(val)
dbg["numbers"].append(val)
if not nums:
return None, None, 0.0, dbg
nums = sorted(nums)
salary_min = nums[0]
salary_max = nums[-1] if len(nums) > 1 else nums[0]
if dbg["hint_lines"] > 0:
conf = 0.82 if dbg["currency_hits"] > 0 else 0.70
else:
conf = 0.58 if dbg["currency_hits"] > 0 else 0.0
if salary_max > salary_min * 4:
conf -= 0.12
if len(nums) == 1:
conf -= 0.06
conf = max(0.0, min(conf, 0.9))
if conf < 0.45:
return None, None, conf, dbg
return salary_min, salary_max, conf, dbg
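# Example (illustrative): a hinted line such as "Salary expectation: 250 000 RUB" gets
# hint + currency confidence; a bare "$300k" is accepted only near the top of the resume
# and at lower confidence, and counts like "10000 users" are rejected by SALARY_NOISE_RE.
def _salary_example() -> Tuple[Optional[int], Optional[int], float, Dict]:
    return extract_salary("Ivan Petrov\nSalary expectation: 250 000 RUB")
    # expected: (250000, 250000, ~0.76, debug): 0.82 for hint + currency minus 0.06 for a single number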
def extract_location_best_effort(text: str) -> Optional[str]:
if not text:
return None
def _clean_loc(val: str) -> str:
return re.sub(r"\s+", " ", (val or "").strip(" |,;"))
def _is_loc_like(val: str, *, allow_single: bool = False) -> bool:
v = _clean_loc(val)
if not v or len(v) < 3 or len(v) > 90:
return False
if re.search(r"[@/\\]", v) or re.search(r"\d{3,}", v):
return False
if SECTION_HEADER_RE.match(v):
return False
if LOCATION_CITY_COUNTRY_RE.match(v):
return True
if allow_single and re.fullmatch(r"[A-Za-zА-Яа-я][A-Za-zА-Яа-я' .\-]{1,40}", v):
return True
return False
patterns = [
re.compile(r"(?i)\b(location|город|city)\s*:\s*(.{2,40})"),
re.compile(r"(?i)\b(место)\s*:\s*(.{2,40})"),
re.compile(r"(?i)\b(проживает|проживание)\s*:\s*(.{2,60})"),
]
for p in patterns:
m = p.search(text)
if m:
val = _clean_loc(m.group(2))
if _is_loc_like(val, allow_single=True):
return val
lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
head: List[str] = []
for ln in lines[:60]:
if SECTION_HEADER_RE.match(ln):
low = ln.lower()
if low in ("contacts", "contact", "contact info"):
continue
break
head.append(ln)
for ln in head:
parts = [ln] + [seg.strip() for seg in ln.split("|") if seg.strip()]
for seg in parts:
if _is_loc_like(seg):
return _clean_loc(seg)
return None

211
extract/pdf_extract.py Normal file
View File

@@ -0,0 +1,211 @@
from __future__ import annotations
import re
import shutil
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple
try: # optional dependency
from pypdf import PdfReader # type: ignore
except Exception: # pragma: no cover
try:
from PyPDF2 import PdfReader # type: ignore
except Exception: # pragma: no cover
PdfReader = None # type: ignore
try: # optional dependency
from pdfminer.high_level import extract_text as pdfminer_extract_text # type: ignore
except Exception: # pragma: no cover
pdfminer_extract_text = None # type: ignore
@dataclass
class PdfExtractResult:
text: str
pages: List[dict]
method: str
score: float
flags: List[str]
_SECTION_HINTS = [
"experience", "work experience", "skills", "education", "projects", "summary", "about",
"опыт работы", "навыки", "образование", "проекты", "о себе",
]
def _which_pdftotext() -> Optional[str]:
exe = shutil.which("pdftotext") or shutil.which("pdftotext.exe")
return exe
def _run_pdftotext(path: Path, *, layout: bool, timeout_sec: int = 25) -> str:
exe = _which_pdftotext()
if not exe:
return ""
cmd = [exe]
if layout:
cmd.append("-layout")
cmd += ["-nopgbrk", str(path), "-"]
try:
p = subprocess.run(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
timeout=timeout_sec,
check=False,
text=True,
encoding="utf-8",
errors="ignore",
)
return (p.stdout or "").strip()
except Exception:
return ""
def _extract_pages_pypdf(path: Path, max_pages: int = 60) -> List[dict]:
if PdfReader is None:
return []
try:
reader = PdfReader(str(path), strict=False)
except Exception:
return []
pages: List[dict] = []
for i, page in enumerate(getattr(reader, "pages", [])):
if max_pages and i >= max_pages:
break
try:
text = page.extract_text() or ""
except Exception:
text = ""
pages.append({"page": i + 1, "text": text})
return pages
def _extract_pdfminer(path: Path) -> str:
if pdfminer_extract_text is None:
return ""
try:
return (pdfminer_extract_text(str(path)) or "").strip()
except Exception:
return ""
def _quality_score(text: str) -> Tuple[float, List[str]]:
flags: List[str] = []
if not text:
return 0.0, ["empty"]
total = len(text)
letters = sum(ch.isalpha() for ch in text)
spaces = text.count(" ")
alpha_ratio = letters / max(1, total)
space_ratio = spaces / max(1, total)
words = re.findall(r"[A-Za-zА-Яа-я0-9]+", text)
avg_word_len = (sum(len(w) for w in words) / max(1, len(words))) if words else 0.0
lines = [ln for ln in text.splitlines() if ln.strip()]
long_lines = [ln for ln in lines if len(ln) > 200]
long_line_ratio = (len(long_lines) / max(1, len(lines))) if lines else 0.0
glued_hits = len(re.findall(r"[a-zа-я][A-ZА-Я]|[A-Za-zА-Яа-я][0-9]|[0-9][A-Za-zА-Яа-я]", text))
section_hits = sum(1 for s in _SECTION_HINTS if s in text.lower())
score = 0.0
if alpha_ratio >= 0.45:
score += 2.0
elif alpha_ratio >= 0.30:
score += 1.0
else:
flags.append("low_alpha")
if 0.10 <= space_ratio <= 0.28:
score += 1.0
else:
flags.append("odd_spacing")
if 3.5 <= avg_word_len <= 9.0:
score += 1.0
else:
flags.append("odd_word_len")
if long_line_ratio <= 0.06:
score += 1.0
else:
flags.append("long_lines")
if glued_hits <= 6:
score += 1.0
else:
flags.append("glued_text")
if section_hits >= 2:
score += 1.0
elif section_hits == 1:
score += 0.5
if total < 200:
flags.append("short_text")
if alpha_ratio < 0.08 or total < 120:
flags.append("scan_like")
return score, flags
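# Example (illustrative): the score rewards normal-looking prose (balanced letter/space
# ratios, plausible word lengths, short lines, few glued tokens, recognizable section
# headers); empty or image-only extractions come back as 0.0 with "empty"/"scan_like" flags.
def _quality_example() -> Tuple[float, List[str]]:
    text = "Experience\nBackend developer at ACME Corp\nSkills\nPython, PostgreSQL, Docker, Kafka\n" * 5
    return _quality_score(text)
    # expected: several heuristics pass (roughly 4-6 points); _quality_score("") == (0.0, ["empty"])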
def deglue_text(text: str) -> str:
if not text:
return text
t = text
t = re.sub(r"([a-zа-я])([A-ZА-Я])", r"\1 \2", t)
t = re.sub(r"([A-Za-zА-Яа-я])([0-9])", r"\1 \2", t)
t = re.sub(r"([0-9])([A-Za-zА-Яа-я])", r"\1 \2", t)
t = re.sub(r"([:;])([A-Za-zА-Яа-я])", r"\1 \2", t)
return t
def extract_pdf_best(path: Path, timeout_sec: int = 25) -> PdfExtractResult:
candidates: List[Tuple[str, str]] = []
txt_layout = _run_pdftotext(path, layout=True, timeout_sec=timeout_sec)
if txt_layout:
candidates.append(("pdftotext_layout", txt_layout))
txt_plain = _run_pdftotext(path, layout=False, timeout_sec=timeout_sec)
if txt_plain:
candidates.append(("pdftotext_plain", txt_plain))
txt_pypdf = ""
if PdfReader is not None:
pages = _extract_pages_pypdf(path)
if pages:
txt_pypdf = "\n\n".join(p.get("text", "") for p in pages if p.get("text"))
if txt_pypdf:
candidates.append(("pypdf", txt_pypdf))
txt_pdfminer = _extract_pdfminer(path)
if txt_pdfminer:
candidates.append(("pdfminer", txt_pdfminer))
if not candidates:
return PdfExtractResult(text="", pages=[], method="none", score=0.0, flags=["empty"])
best_method = "none"
best_text = ""
best_score = -1.0
best_flags: List[str] = []
for method, text in candidates:
score, flags = _quality_score(text)
if score > best_score:
best_score = score
best_method = method
best_text = text
best_flags = flags
pages = _extract_pages_pypdf(path)
best_text = deglue_text(best_text)
return PdfExtractResult(text=best_text, pages=pages, method=best_method, score=best_score, flags=best_flags)
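# Usage sketch (illustrative): candidates are ranked purely by _quality_score, so the winning
# extractor can differ per file; "resume.pdf" is a hypothetical path.
def _pdf_extract_example() -> str:
    res = extract_pdf_best(Path("resume.pdf"))
    if res.score < 3.0 or "scan_like" in res.flags:
        # likely a scanned/image PDF; an OCR fallback would be needed upstream
        return ""
    return res.text  # de-glued text from the best-scoring extractor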

70
extract/sections.py Normal file
View File

@@ -0,0 +1,70 @@
from __future__ import annotations
import re
from typing import Dict, List, Optional, Tuple
_SECTION_PATTERNS: dict[str, List[re.Pattern]] = {
"contacts": [
re.compile(r"^\s*(contacts?|contact info|контакты)\s*$", re.I),
],
"about": [
re.compile(r"^\s*(summary|about|profile|objective|о\s+себе|обо\s+мне|профиль|цель)\s*$", re.I),
],
"skills": [
re.compile(r"^\s*(skills?|key skills|stack|tech( stack)?|навыки|технологии|компетенции)\s*$", re.I),
],
"experience": [
re.compile(r"^\s*(experience|work experience|employment|опыт\s+работы|опыт)\s*$", re.I),
],
"education": [
re.compile(r"^\s*(education|образование|курсы|certifications?|сертификаты)\s*$", re.I),
],
"projects": [
re.compile(r"^\s*(projects?|проекты)\s*$", re.I),
],
"languages": [
re.compile(r"^\s*(languages?|языки)\s*$", re.I),
],
"certifications": [
re.compile(r"^\s*(certifications?|сертификаты|курсы)\s*$", re.I),
],
"publications": [
re.compile(r"^\s*(publications?|публикации)\s*$", re.I),
],
}
def _match_header(line: str) -> Optional[str]:
for key, patterns in _SECTION_PATTERNS.items():
for rx in patterns:
if rx.match(line):
return key
return None
def split_sections(clean_text: str, doc_type: str | None = None) -> Dict[str, str]:
lines = [ln.strip() for ln in (clean_text or "").splitlines()]
sections: Dict[str, List[str]] = {"header": []}
current = "header"
for ln in lines:
if not ln:
continue
key = _match_header(ln)
if key:
current = key
sections.setdefault(current, [])
continue
sections.setdefault(current, []).append(ln)
out: Dict[str, str] = {}
for k, vals in sections.items():
text = "\n".join(vals).strip()
if text:
out[k] = text
return out
def sections_present(sections: Dict[str, str]) -> List[str]:
return sorted([k for k, v in (sections or {}).items() if v and k != "header"])
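# Example (illustrative): headers are matched line by line, so everything before the first
# recognized header stays under "header".
def _sections_example() -> Dict[str, str]:
    sections = split_sections("Ivan Petrov\nSkills\nPython, SQL\nExperience\nACME, 2019-2024")
    # expected: {"header": "Ivan Petrov", "skills": "Python, SQL", "experience": "ACME, 2019-2024"};
    # sections_present(sections) -> ["experience", "skills"]
    return sections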

View File

@@ -0,0 +1 @@
__all__ = []

View File

@@ -0,0 +1,46 @@
from __future__ import annotations
from typing import Any, Dict
from tg_resume_db.extract.parse import (
extract_contacts,
extract_name_guess,
extract_remote,
extract_english,
extract_roles_skills,
extract_salary,
extract_location_best_effort,
extract_experience_years,
)
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
text = clean_text or ""
contacts_raw = extract_contacts(text)
name = extract_name_guess(text)
remote = extract_remote(text)
english = extract_english(text)
roles, skills = extract_roles_skills(text)
location = extract_location_best_effort(text)
exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(text)
sal_min, sal_max, sal_conf, sal_dbg = extract_salary(text)
return {
"name": name,
"contacts_raw": contacts_raw,
"remote": remote,
"english": english,
"roles": roles,
"skills": skills,
"location": location,
"exp_years": exp_years,
"exp_years_eng": exp_years_eng,
"exp_conf": exp_conf,
"exp_dbg": exp_dbg,
"salary_min": sal_min,
"salary_max": sal_max,
"salary_conf": sal_conf,
"salary_dbg": sal_dbg,
"parse_method": "generic_heur",
}

58
extract/templates/hh.py Normal file
View File

@@ -0,0 +1,58 @@
from __future__ import annotations
from typing import Any, Dict
from tg_resume_db.extract.parse import (
extract_contacts,
extract_name_guess,
extract_remote,
extract_english,
extract_roles_skills,
extract_salary,
extract_location_best_effort,
extract_experience_years,
)
def _pick(sections: Dict[str, str] | None, key: str, fallback: str) -> str:
if not sections:
return fallback
return sections.get(key) or fallback
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
header_text = _pick(sections, "header", clean_text)
contacts_text = _pick(sections, "contacts", clean_text)
about_text = _pick(sections, "about", clean_text)
skills_text = _pick(sections, "skills", clean_text)
exp_text = _pick(sections, "experience", clean_text)
exp_scope = "\n".join([about_text, exp_text]).strip() or exp_text
name = extract_name_guess(header_text)
contacts_raw = extract_contacts(contacts_text)
roles, skills = extract_roles_skills("\n".join([about_text, skills_text, exp_text]))
remote = extract_remote(clean_text)
english = extract_english(clean_text)
location = extract_location_best_effort(clean_text)
exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(exp_scope)
sal_min, sal_max, sal_conf, sal_dbg = extract_salary(clean_text)
return {
"name": name,
"contacts_raw": contacts_raw,
"remote": remote,
"english": english,
"roles": roles,
"skills": skills,
"location": location,
"exp_years": exp_years,
"exp_years_eng": exp_years_eng,
"exp_conf": exp_conf,
"exp_dbg": exp_dbg,
"salary_min": sal_min,
"salary_max": sal_max,
"salary_conf": sal_conf,
"salary_dbg": sal_dbg,
"parse_method": "hh_template",
}

View File

@@ -0,0 +1,85 @@
from __future__ import annotations
import re
from typing import Any, Dict, Optional
from tg_resume_db.extract.parse import (
extract_contacts,
extract_name_guess,
extract_remote,
extract_english,
extract_roles_skills,
extract_salary,
extract_location_best_effort,
extract_experience_years,
)
_DESIRED_RE = re.compile(r"(?i)жел[а-я]*\s+должност[ьи]\s*[:\-]?\s*(.+)")
_SPEC_RE = re.compile(r"(?i)специализаци[яи]\s*[:\-]?\s*(.+)")
_SCHEDULE_RE = re.compile(r"(?i)график\s+работы\s*[:\-]?\s*(.+)")
_EMPLOYMENT_RE = re.compile(r"(?i)занятость\s*[:\-]?\s*(.+)")
def _pick(sections: Dict[str, str] | None, key: str, fallback: str) -> str:
if not sections:
return fallback
return sections.get(key) or fallback
def _find_first(regex: re.Pattern, text: str) -> Optional[str]:
for ln in text.splitlines():
m = regex.search(ln)
if m:
val = m.group(1).strip()
val = re.split(r"[|;/]", val)[0].strip()
if 2 <= len(val) <= 80:
return val
return None
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
header_text = _pick(sections, "header", clean_text)
contacts_text = _pick(sections, "contacts", clean_text)
about_text = _pick(sections, "about", clean_text)
skills_text = _pick(sections, "skills", clean_text)
exp_text = _pick(sections, "experience", clean_text)
exp_scope = "\n".join([about_text, exp_text]).strip() or exp_text
name = extract_name_guess(header_text)
contacts_raw = extract_contacts(contacts_text)
roles, skills = extract_roles_skills("\n".join([about_text, skills_text, exp_text]))
remote = extract_remote(clean_text)
english = extract_english(clean_text)
location = extract_location_best_effort(clean_text)
exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(exp_scope)
sal_min, sal_max, sal_conf, sal_dbg = extract_salary(clean_text)
desired_title = _find_first(_DESIRED_RE, clean_text)
specializations = _find_first(_SPEC_RE, clean_text)
schedule = _find_first(_SCHEDULE_RE, clean_text)
employment = _find_first(_EMPLOYMENT_RE, clean_text)
return {
"name": name,
"contacts_raw": contacts_raw,
"remote": remote,
"english": english,
"roles": roles,
"skills": skills,
"location": location,
"exp_years": exp_years,
"exp_years_eng": exp_years_eng,
"exp_conf": exp_conf,
"exp_dbg": exp_dbg,
"salary_min": sal_min,
"salary_max": sal_max,
"salary_conf": sal_conf,
"salary_dbg": sal_dbg,
"desired_title": desired_title,
"specializations": specializations,
"employment_type": employment,
"schedule": schedule,
"parse_method": "hh_template",
}

View File

@@ -0,0 +1,57 @@
from __future__ import annotations
from typing import Any, Dict
from tg_resume_db.extract.parse import (
extract_contacts,
extract_name_guess,
extract_remote,
extract_english,
extract_roles_skills,
extract_salary,
extract_location_best_effort,
extract_experience_years,
)
def _pick(sections: Dict[str, str] | None, key: str, fallback: str) -> str:
if not sections:
return fallback
return sections.get(key) or fallback
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
header_text = _pick(sections, "header", clean_text)
about_text = _pick(sections, "about", clean_text)
skills_text = _pick(sections, "skills", clean_text)
exp_text = _pick(sections, "experience", clean_text)
exp_scope = "\n".join([about_text, exp_text]).strip() or exp_text
name = extract_name_guess(header_text)
contacts_raw = extract_contacts(clean_text)
roles, skills = extract_roles_skills("\n".join([about_text, skills_text, exp_text]))
remote = extract_remote(clean_text)
english = extract_english(clean_text)
location = extract_location_best_effort(clean_text)
exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(exp_scope)
sal_min, sal_max, sal_conf, sal_dbg = extract_salary(clean_text)
return {
"name": name,
"contacts_raw": contacts_raw,
"remote": remote,
"english": english,
"roles": roles,
"skills": skills,
"location": location,
"exp_years": exp_years,
"exp_years_eng": exp_years_eng,
"exp_conf": exp_conf,
"exp_dbg": exp_dbg,
"salary_min": sal_min,
"salary_max": sal_max,
"salary_conf": sal_conf,
"salary_dbg": sal_dbg,
"parse_method": "linkedin_template",
}

View File

@@ -0,0 +1,46 @@
from __future__ import annotations
from typing import Any, Dict
from tg_resume_db.extract.parse import (
extract_contacts,
extract_name_guess,
extract_remote,
extract_english,
extract_roles_skills,
extract_salary,
extract_location_best_effort,
extract_experience_years,
)
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
text = clean_text or ""
contacts_raw = extract_contacts(text)
name = extract_name_guess(text)
roles, skills = extract_roles_skills(text)
remote = extract_remote(text)
english = extract_english(text)
location = extract_location_best_effort(text)
exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(text)
sal_min, sal_max, sal_conf, sal_dbg = extract_salary(text)
return {
"name": name,
"contacts_raw": contacts_raw,
"remote": remote,
"english": english,
"roles": roles,
"skills": skills,
"location": location,
"exp_years": exp_years,
"exp_years_eng": exp_years_eng,
"exp_conf": exp_conf,
"exp_dbg": exp_dbg,
"salary_min": sal_min,
"salary_max": sal_max,
"salary_conf": sal_conf,
"salary_dbg": sal_dbg,
"parse_method": "one_page_template",
}

View File

@@ -0,0 +1,11 @@
from __future__ import annotations
from typing import Any, Dict
from tg_resume_db.extract.templates.one_page import parse_resume as _parse
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
out = _parse(clean_text, sections)
out["parse_method"] = "one_page_en"
return out

View File

@@ -0,0 +1,11 @@
from __future__ import annotations
from typing import Any, Dict
from tg_resume_db.extract.templates.one_page import parse_resume as _parse
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
out = _parse(clean_text, sections)
out["parse_method"] = "one_page_ru"
return out

View File

@@ -0,0 +1,45 @@
from __future__ import annotations
from typing import Any, Dict
from tg_resume_db.extract.parse import (
extract_contacts,
extract_name_guess,
extract_remote,
extract_english,
extract_roles_skills,
extract_salary,
extract_location_best_effort,
extract_experience_years,
)
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
text = clean_text or ""
contacts_raw = extract_contacts(text)
name = extract_name_guess(text)
roles, skills = extract_roles_skills(text)
remote = extract_remote(text)
english = extract_english(text)
location = extract_location_best_effort(text)
exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(text)
sal_min, sal_max, sal_conf, sal_dbg = extract_salary(text)
return {
"name": name,
"contacts_raw": contacts_raw,
"remote": remote,
"english": english,
"roles": roles,
"skills": skills,
"location": location,
"exp_years": exp_years,
"exp_years_eng": exp_years_eng,
"exp_conf": exp_conf,
"exp_dbg": exp_dbg,
"salary_min": sal_min,
"salary_max": sal_max,
"salary_conf": sal_conf,
"salary_dbg": sal_dbg,
"parse_method": "pptx_template",
}

99
extract/text_extract.py Normal file
View File

@@ -0,0 +1,99 @@
from __future__ import annotations
import os
from pathlib import Path
import logging
from bs4 import BeautifulSoup
try: # optional dependency for PDF fallback
from pypdf import PdfReader as _PdfReader # type: ignore
except Exception: # pragma: no cover - optional import
try:
from PyPDF2 import PdfReader as _PdfReader # type: ignore
except Exception: # pragma: no cover
_PdfReader = None # type: ignore
def _read_bytes(path: Path) -> bytes:
return path.read_bytes()
def extract_text_from_txt(path: Path) -> str:
data = _read_bytes(path)
    # Strict decode inside the loop so non-UTF-8 files actually fall through to the next
    # encoding; errors="ignore" is reserved for the final fallback.
    for enc in ("utf-8", "utf-16", "cp1251", "latin-1"):
        try:
            return data.decode(enc)
        except Exception:
            continue
    return data.decode("utf-8", errors="ignore")
def extract_text_from_html(path: Path) -> str:
html = extract_text_from_txt(path)
soup = BeautifulSoup(html, "lxml")
return soup.get_text("\n", strip=True)
def extract_text_from_docx(path: Path) -> str:
from docx import Document
doc = Document(str(path))
parts = []
for p in doc.paragraphs:
if p.text and p.text.strip():
parts.append(p.text.strip())
for table in doc.tables:
for row in table.rows:
cells = [c.text.strip() for c in row.cells if c.text and c.text.strip()]
if cells:
parts.append(" | ".join(cells))
return "\n".join(parts)
_PDF_PAGE_LIMIT = int(os.environ.get("PDF_PAGE_LIMIT", "40"))
# Silence noisy pypdf warnings like "Ignoring wrong pointing object ..."
logging.getLogger("pypdf").setLevel(logging.ERROR)
logging.getLogger("PyPDF2").setLevel(logging.ERROR)
def extract_text_from_pdf(path: Path) -> str:
"""
Lightweight PDF extractor; prefers optional PyPDF-based readers over heavy pdfminer.
Reads at most PDF_PAGE_LIMIT pages (default 40) to avoid pathological files.
"""
if _PdfReader is None:
raise RuntimeError("PDF reader dependency missing (install pypdf or PyPDF2)")
try:
reader = _PdfReader(str(path), strict=False)
except Exception as exc: # pragma: no cover - pdf parser edge cases
raise RuntimeError(f"PDF read failed: {exc}") from exc
parts = []
for idx, page in enumerate(getattr(reader, "pages", [])):
if _PDF_PAGE_LIMIT and idx >= _PDF_PAGE_LIMIT:
break
try:
text = page.extract_text() # type: ignore[attr-defined]
except Exception:
text = None
if text:
parts.append(text)
return "\n".join(parts)
def extract_text_from_doc_best_effort(path: Path) -> str:
# .doc requires external tools; best-effort if textract installed
try:
import textract # type: ignore
b = textract.process(str(path))
return b.decode("utf-8", errors="ignore")
except Exception:
return ""
def extract_text(path: Path) -> str:
ext = path.suffix.lower()
if ext in (".txt", ".log"):
return extract_text_from_txt(path)
if ext in (".html", ".htm"):
return extract_text_from_html(path)
if ext == ".docx":
return extract_text_from_docx(path)
if ext == ".pdf":
return extract_text_from_pdf(path)
if ext == ".doc":
return extract_text_from_doc_best_effort(path)
return ""
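# Usage sketch (illustrative): dispatch is by extension only, so a mislabelled file (e.g.
# HTML saved with a .pdf suffix) goes down the PDF path and may raise; callers are expected
# to handle errors per file. "cv.docx" is a hypothetical path.
def _extract_text_example() -> str:
    try:
        return extract_text(Path("cv.docx"))
    except Exception:
        return ""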

21
importers/file_scan.py Normal file
View File

@@ -0,0 +1,21 @@
from __future__ import annotations
from pathlib import Path
from typing import Dict, Iterator
RESUME_EXTS = {".pdf", ".docx", ".doc", ".txt", ".html", ".htm"}
def iter_files(root: Path) -> Iterator[Dict]:
for p in root.rglob("*"):
if p.is_file() and p.suffix.lower() in RESUME_EXTS:
yield {
"origin_type": "file_scan",
"export_path": str(root),
"chat_title": None,
"message_id": None,
"message_date": None,
"message_text": "",
"file_path": str(p.resolve()),
"original_name": p.name,
"extra": {},
}

View File

@@ -0,0 +1,66 @@
from __future__ import annotations
import re
from pathlib import Path
from typing import Dict, Iterator, List, Optional
from bs4 import BeautifulSoup
RESUME_EXTS = {".pdf", ".docx", ".doc", ".txt", ".html", ".htm"}
def find_messages_html(root: Path) -> List[Path]:
return [p for p in root.rglob("messages*.html") if p.is_file()]
def iter_artifacts(messages_html: Path) -> Iterator[Dict]:
html = messages_html.read_text(encoding="utf-8", errors="ignore")
soup = BeautifulSoup(html, "lxml")
chat_title = None
h = soup.find(class_=re.compile(r"page_header", re.I))
if h:
chat_title = h.get_text(" ", strip=True)
chat_title = chat_title or messages_html.parent.name
for msg in soup.select(".message.default.clearfix, .message"):
message_id = msg.get("id") or None
date_div = msg.select_one(".date")
msg_date = date_div.get("title") if date_div else None
text_div = msg.select_one(".text")
msg_text = text_div.get_text("\n", strip=True) if text_div else ""
file_path = None
original_name = None
for a in msg.find_all("a", href=True):
href = a["href"]
p = (messages_html.parent / href).resolve()
if p.exists() and p.suffix.lower() in RESUME_EXTS:
file_path = str(p)
original_name = p.name
break
if file_path:
yield {
"origin_type": "telegram_html",
"export_path": str(messages_html.parent),
"chat_title": chat_title,
"message_id": str(message_id) if message_id else None,
"message_date": msg_date,
"message_text": msg_text or "",
"file_path": file_path,
"original_name": original_name,
"extra": {"html_path": str(messages_html)},
}
else:
if msg_text and len(msg_text.strip()) >= 500:
yield {
"origin_type": "message_text",
"export_path": str(messages_html.parent),
"chat_title": chat_title,
"message_id": str(message_id) if message_id else None,
"message_date": msg_date,
"message_text": msg_text,
"file_path": None,
"original_name": None,
"extra": {"html_path": str(messages_html)},
}

View File

@@ -0,0 +1,73 @@
from __future__ import annotations
import json
from pathlib import Path
from typing import Dict, Iterator, List, Optional
RESUME_EXTS = {".pdf", ".docx", ".doc", ".txt", ".html", ".htm"}
def find_result_json(root: Path) -> List[Path]:
return list(root.rglob("result.json"))
def _text_field_to_str(text_field) -> str:
if isinstance(text_field, str):
return text_field
if isinstance(text_field, list):
parts = []
for item in text_field:
if isinstance(item, str):
parts.append(item)
elif isinstance(item, dict) and "text" in item:
parts.append(str(item["text"]))
return "".join(parts)
return ""
def iter_artifacts(result_json: Path) -> Iterator[Dict]:
data = json.loads(result_json.read_text(encoding="utf-8", errors="ignore"))
    chats = []
    if isinstance(data, dict):
        chats_field = data.get("chats") or {}
        if isinstance(chats_field, dict):
            chats = chats_field.get("list", []) or []
        elif isinstance(chats_field, list):
            chats = chats_field
for chat in chats:
chat_title = chat.get("name") or chat.get("title") or "unknown_chat"
messages = chat.get("messages", []) or []
for msg in messages:
msg_id = str(msg.get("id") or "")
msg_date = msg.get("date") or msg.get("date_unixtime") or None
text = _text_field_to_str(msg.get("text", ""))
file_rel = msg.get("file") or None
file_path = None
original_name = None
if file_rel:
p = (result_json.parent / file_rel).resolve()
if p.exists() and p.suffix.lower() in RESUME_EXTS:
file_path = str(p)
original_name = p.name
if file_path:
yield {
"origin_type": "telegram_json",
"export_path": str(result_json.parent),
"chat_title": chat_title,
"message_id": msg_id,
"message_date": str(msg_date) if msg_date is not None else None,
"message_text": text or "",
"file_path": file_path,
"original_name": original_name,
"extra": {"json_path": str(result_json)},
}
else:
# message-only resume paste (heuristic)
if text and len(text.strip()) >= 500:
yield {
"origin_type": "message_text",
"export_path": str(result_json.parent),
"chat_title": chat_title,
"message_id": msg_id,
"message_date": str(msg_date) if msg_date is not None else None,
"message_text": text,
"file_path": None,
"original_name": None,
"extra": {"json_path": str(result_json)},
}

174
normalize.py Normal file
View File

@@ -0,0 +1,174 @@
from __future__ import annotations
import re
from typing import Dict, List, Optional, Tuple
_SKILL_SYNONYMS: Dict[str, List[str]] = {
"python": ["py"],
"javascript": ["js", "node", "nodejs", "java script", "java-script"],
"typescript": ["ts", "type script", "type-script"],
"postgresql": ["postgres", "psql"],
"kubernetes": ["k8s"],
"docker": [],
"fastapi": [],
"django": ["drf", "django rest framework"],
"flask": [],
"golang": ["go"],
"c++": ["cpp"],
"c#": ["csharp"],
"redis": [],
"kafka": [],
"rabbitmq": [],
"grpc": [],
"rest": [],
}
_SKILL_STOP = {"rest", "http", "json", "xml", "oop"}
_ROLE_SYNONYMS: Dict[str, List[str]] = {
"backend": ["backend developer", "backend engineer", "бэкенд", "бекенд", "серверный разработчик"],
"frontend": ["frontend developer", "frontend engineer", "фронтенд", "фронт"],
"fullstack": ["full stack", "full-stack", "фулстек", "fullstack developer"],
"devops": ["sre", "site reliability"],
"qa": ["tester", "тестировщик"],
"data": ["data engineer", "data scientist", "ml engineer", "машинное обучение"],
"mobile": ["android", "ios", "mobile developer", "мобильный разработчик"],
}
def _build_alias_map(src: Dict[str, List[str]]) -> Dict[str, str]:
alias = {}
for canonical, al in src.items():
alias[canonical] = canonical
for a in al:
alias[a] = canonical
return {k.lower(): v for k, v in alias.items()}
_SKILL_ALIAS = _build_alias_map(_SKILL_SYNONYMS)
_ROLE_ALIAS = _build_alias_map(_ROLE_SYNONYMS)
def _normalize_skill_surface(token: str) -> str:
t = (token or "").strip().lower()
if not t:
return ""
t = t.replace("/", " ")
t = re.sub(r"[_\-]+", " ", t)
t = re.sub(r"\s+", " ", t).strip()
# "java script", "type script", "postgre sql", "graph ql", "g rpc"
t = re.sub(r"\bjava\s+script\b", "javascript", t)
t = re.sub(r"\btype\s+script\b", "typescript", t)
t = re.sub(r"\bpostgre\s+sql\b", "postgresql", t)
t = re.sub(r"\bgraph\s+ql\b", "graphql", t)
t = re.sub(r"\bg\s+rpc\b", "grpc", t)
t = re.sub(r"\bdocker\s+compose\b", "docker compose", t)
return t
def normalize_skill(token: str) -> Optional[str]:
t = _normalize_skill_surface(token)
if not t:
return None
# Avoid false-positive java from "javascript"
if t == "java" and re.search(r"\bjava\s*script\b", _normalize_skill_surface(token)):
return "javascript"
return _SKILL_ALIAS.get(t, t)
def normalize_skills(skills: List[str]) -> List[str]:
out: List[str] = []
seen = set()
for s in skills or []:
canon = normalize_skill(s)
if not canon or canon in seen:
continue
seen.add(canon)
out.append(canon)
return out
def normalize_role(token: str) -> Optional[str]:
t = (token or "").strip().lower()
if not t:
return None
return _ROLE_ALIAS.get(t, t)
def normalize_roles(roles: List[str]) -> List[str]:
out: List[str] = []
seen = set()
for r in roles or []:
canon = normalize_role(r)
if not canon or canon in seen:
continue
seen.add(canon)
out.append(canon)
return out
def split_skills_primary_secondary(
skills: List[str],
*,
clean_text: str,
sections: Dict[str, str] | None = None,
primary_limit: int = 25,
) -> Tuple[List[str], List[str]]:
if not skills:
return [], []
text = (clean_text or "").lower()
skills_section = (sections or {}).get("skills", "").lower()
experience_section = (sections or {}).get("experience", "").lower()
scores: Dict[str, float] = {}
for sk in skills:
s = sk.lower()
score = 1.0
if s in skills_section:
score += 2.2
if s in experience_section:
score += 1.2
count = len(re.findall(r"\b" + re.escape(s) + r"\b", text))
score += min(2.5, count * 0.5)
if s in _SKILL_STOP:
score -= 1.5
scores[sk] = score
ranked = sorted(skills, key=lambda x: scores.get(x, 0.0), reverse=True)
primary = [s for s in ranked if scores.get(s, 0.0) >= 2.0][:primary_limit]
secondary = [s for s in ranked if s not in primary]
return primary, secondary
def normalize_location(raw: Optional[str]) -> Optional[str]:
if not raw:
return None
t = raw.strip()
low = t.lower()
if low in ("москва", "moscow", "moscow, russia"):
return "Moscow, Russia"
if low in ("санкт-петербург", "спб", "питер", "saint petersburg"):
return "Saint Petersburg, Russia"
return t
def find_skills_in_text(text: str) -> List[str]:
if not text:
return []
found: List[str] = []
seen = set()
low = _normalize_skill_surface(text)
    for alias, canon in _SKILL_ALIAS.items():
        key = _normalize_skill_surface(alias)
        if not key or canon in seen:
            continue
        # lookarounds instead of \b so aliases like "c++" / "c#" still match at word edges
        if re.search(r"(?<![a-z0-9+#])" + re.escape(key) + r"(?![a-z0-9+#])", low):
            found.append(canon)
            seen.add(canon)
return found
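# Example (illustrative): alias folding plus de-duplication, order of first hits preserved.
def _normalize_example() -> List[str]:
    return normalize_skills(["JS", "NodeJS", "Postgres", "k8s", "Python"])
    # expected: ["javascript", "postgresql", "kubernetes", "python"]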

45
pdf_merge.py Normal file
View File

@@ -0,0 +1,45 @@
from __future__ import annotations
from pathlib import Path
from typing import Iterable, List, Optional
from pypdf import PdfReader, PdfWriter
def merge_pdfs(pdf_paths: Iterable[str | Path], out_pdf_path: str | Path) -> dict:
out_pdf_path = Path(out_pdf_path)
out_pdf_path.parent.mkdir(parents=True, exist_ok=True)
writer = PdfWriter()
merged: List[str] = []
skipped: List[str] = []
for p in pdf_paths:
path = Path(p)
try:
reader = PdfReader(str(path))
            # simply append the pages one after another
for page in reader.pages:
writer.add_page(page)
merged.append(str(path))
except Exception:
skipped.append(str(path))
if merged:
with out_pdf_path.open("wb") as f:
writer.write(f)
return {
"out_pdf": str(out_pdf_path),
"merged_count": len(merged),
"skipped_count": len(skipped),
"merged_files": merged,
"skipped_files": skipped,
}
def merge_all_pdfs_in_dir(files_dir: str | Path, out_pdf_path: str | Path) -> dict:
files_dir = Path(files_dir)
    # match .pdf case-insensitively without double-counting on case-insensitive filesystems
    pdfs = sorted(p for p in files_dir.rglob("*") if p.is_file() and p.suffix.lower() == ".pdf")
return merge_pdfs(pdfs, out_pdf_path)
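# Usage sketch (illustrative): merge every PDF found under an export directory into a single
# bundle; the paths here are hypothetical.
def _merge_example() -> dict:
    return merge_all_pdfs_in_dir("./export/files", "./export/bundle.pdf")
    # returns e.g. {"out_pdf": "./export/bundle.pdf", "merged_count": ..., "skipped_files": [...]}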

1990
pipeline.py Normal file

File diff suppressed because it is too large Load Diff

393
search.py Normal file
View File

@@ -0,0 +1,393 @@
from __future__ import annotations
import json
import re
import sqlite3
from typing import Any, Dict, List, Tuple
from tg_resume_db.normalize import normalize_skill, find_skills_in_text
# -----------------------------
# Normalization helpers
# -----------------------------
def _norm_token(v: str) -> str:
return " ".join(str(v).strip().lower().split())
def _as_list(v: Any) -> List[str]:
"""
Accepts:
- None
- list
- "a,b,c" (csv string)
"""
if v is None:
return []
if isinstance(v, list):
return [str(x) for x in v if str(x).strip()]
s = str(v).strip()
if not s:
return []
return [x.strip() for x in s.split(",") if x.strip()]
def _uniq_keep_order(xs: List[str]) -> List[str]:
seen = set()
out: List[str] = []
for x in xs:
t = _norm_token(x)
if not t or t in seen:
continue
seen.add(t)
out.append(t)
return out
# -----------------------------
# Pipe-normalized columns filters
# skills_norm / roles_norm like: "|python|fastapi|"
# -----------------------------
def _pipe_any_clause(field: str, values: List[str]) -> Tuple[str, List[Any]]:
vals = [_norm_token(x) for x in (values or []) if str(x).strip()]
if not vals:
return ("1=1", [])
parts: List[str] = []
args: List[Any] = []
for v in vals:
parts.append(f"instr({field}, ?) > 0")
args.append(f"|{v}|")
return "(" + " OR ".join(parts) + ")", args
def _pipe_all_clause(field: str, values: List[str]) -> Tuple[str, List[Any]]:
vals = [_norm_token(x) for x in (values or []) if str(x).strip()]
if not vals:
return ("1=1", [])
parts: List[str] = []
args: List[Any] = []
for v in vals:
parts.append(f"instr({field}, ?) > 0")
args.append(f"|{v}|")
return "(" + " AND ".join(parts) + ")", args
# -----------------------------
# FTS5 sanitizer (fixes comma/garbage breaking MATCH)
# -----------------------------
# allow longer queries (lists of names, long prompts) without aggressive truncation
_FTS_MAX_TERMS = 48
def _fts_safe_query(q: str) -> str:
"""
Turn a free-form recruiter text into a safe FTS5 MATCH expression.
We intentionally DO NOT allow raw FTS syntax from user input,
because it easily breaks on commas/quotes/etc.
Example:
"Backend developer, опыт 5+ лет, Java C++ Python" ->
"\"backend\" OR \"developer\" OR \"опыт\" OR \"лет\" OR \"java\" OR \"cpp\" OR \"python\""
"""
if not q:
return "resume"
s = q.strip().lower()
# normalize common tokens
s = s.replace("c++", "cpp")
s = s.replace("c#", "csharp")
s = s.replace(".net", "dotnet")
# remove punctuation that breaks MATCH
s = re.sub(r"[,\(\)\[\]\{\};:]+", " ", s)
s = re.sub(r"\s+", " ", s).strip()
# tokens (latin/cyrillic + digits + a few chars)
terms = re.findall(r"[a-z0-9а-яё][a-z0-9а-яё._#+-]{1,}", s, flags=re.I)
terms = terms[:_FTS_MAX_TERMS]
if not terms:
return "resume"
# quote every term => safe; join with OR => broad query
return " OR ".join([f"\"{t}\"" for t in terms])
def _parse_query_modifiers(q: str) -> Tuple[List[str], List[str], str]:
"""
Extract +must and -exclude skills from query; return (must, exclude, cleaned_query).
"""
if not q:
return [], [], ""
    must_raw = re.findall(r"(?<!\w)\+([A-Za-z0-9#.+-]{2,})", q)
    excl_raw = re.findall(r"(?<!\w)-([A-Za-z0-9#.+-]{2,})", q)
must = []
exclude = []
for t in must_raw:
canon = normalize_skill(t)
if canon:
must.append(canon)
for t in excl_raw:
canon = normalize_skill(t)
if canon:
exclude.append(canon)
if " and " in q.lower() or " & " in q:
must += find_skills_in_text(q)
    cleaned = re.sub(r"(?<!\w)[+-][A-Za-z0-9#.+-]{2,}", " ", q)
cleaned = re.sub(r"\s+", " ", cleaned).strip()
return _uniq_keep_order(must), _uniq_keep_order(exclude), cleaned
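# Example (illustrative): "+" marks required skills, "-" marks exclusions, and both are
# stripped from the text handed to FTS.
def _modifiers_example() -> Tuple[List[str], List[str], str]:
    return _parse_query_modifiers("backend +python +fastapi -php senior")
    # expected: (["python", "fastapi"], ["php"], "backend senior")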
# -----------------------------
# Contacts
# -----------------------------
def _fetch_contacts_map(con: sqlite3.Connection, candidate_id: str) -> Dict[str, List[str]]:
rows = con.execute(
"SELECT contact_type, contact_value FROM candidate_contacts WHERE candidate_id=?",
(candidate_id,),
).fetchall()
m: Dict[str, List[str]] = {}
for r in rows:
m.setdefault(r["contact_type"], []).append(r["contact_value"])
    # a bit cleaner: de-duplicate contact values
for k, vals in list(m.items()):
m[k] = _uniq_keep_order(vals)
return m
# -----------------------------
# Main search (FTS + filters)
# -----------------------------
def search(
con: sqlite3.Connection,
query: str,
filters: Dict[str, Any],
limit: int = 20,
offset: int = 0,
) -> List[Dict[str, Any]]:
"""
Search candidates using:
- FTS5 for ranking/snippet
- stack filters for skills/roles via pipe-normalized columns
- basic filters: remote/location/experience/salary/english
"""
where: List[str] = ["r.is_active = 1"]
params: List[Any] = []
must_skills, exclude_skills, cleaned_query = _parse_query_modifiers(query or "")
# -------- basic filters --------
if filters.get("remote") is not None:
where.append("c.remote = ?")
params.append(1 if bool(filters["remote"]) else 0)
if filters.get("location"):
where.append("c.location IS NOT NULL AND lower(c.location) LIKE ?")
params.append("%" + str(filters["location"]).lower() + "%")
    # Use experience_years for the SQL filter (broad search);
    # the strict experience_years_eng check happens during post-filtering in agent.py
if filters.get("experience_min") is not None:
where.append("c.experience_years IS NOT NULL AND c.experience_years >= ?")
params.append(float(filters["experience_min"]))
# Salary: "unknown salary doesn't exclude"
if filters.get("salary_min") is not None:
where.append("(c.salary_max IS NULL OR c.salary_max >= ?)")
params.append(int(filters["salary_min"]))
if filters.get("salary_max") is not None:
where.append("(c.salary_min IS NULL OR c.salary_min <= ?)")
params.append(int(filters["salary_max"]))
if filters.get("doc_type"):
where.append("r.doc_type = ?")
params.append(str(filters["doc_type"]))
    # English: do not filter at the SQL level (otherwise B2 would miss C1/C2); post-filtered in agent.py
# -------- roles/skills stack filters --------
# backward compatibility
skills_any: List[str] = []
skills_all: List[str] = []
roles_any: List[str] = []
if filters.get("skill"):
skills_any.append(str(filters["skill"]))
if filters.get("role"):
roles_any.append(str(filters["role"]))
skills_any += _as_list(filters.get("skills_any"))
skills_all += _as_list(filters.get("skills_all"))
roles_any += _as_list(filters.get("roles_any"))
skills_any = _uniq_keep_order([normalize_skill(s) or s for s in skills_any])
skills_all = _uniq_keep_order([normalize_skill(s) or s for s in skills_all])
roles_any = _uniq_keep_order(roles_any)
if must_skills:
skills_all = _uniq_keep_order(skills_all + must_skills)
# Denis rule: if any skills were provided -> enforce ANY match
if skills_any:
clause, args = _pipe_any_clause("c.skills_norm", skills_any)
where.append(clause)
params.extend(args)
if skills_all:
clause, args = _pipe_all_clause("c.skills_norm", skills_all)
where.append(clause)
params.extend(args)
if roles_any:
clause, args = _pipe_any_clause("c.roles_norm", roles_any)
where.append(clause)
params.extend(args)
if exclude_skills:
for sk in exclude_skills:
where.append("instr(c.skills_norm, ?) = 0")
params.append(f"|{sk}|")
# -------- FTS query (SAFE) --------
fts_q = _fts_safe_query(cleaned_query or "")
limit = max(1, min(int(limit or 20), 100))
offset = max(0, int(offset or 0))
# UPDATED SQL: Added experience_years_eng and language/backend metadata
sql = f"""
SELECT
c.candidate_id,
c.name,
c.location,
c.remote,
c.experience_years,
c.experience_years_eng,
c.experience_confidence,
c.salary_min,
c.salary_max,
c.salary_confidence,
c.english_level,
c.roles_json,
c.skills_json,
c.primary_languages_json,
c.backend_focus,
r.doc_type,
r.doc_type_confidence,
r.parse_method,
r.resume_id,
snippet(resumes_fts, 2, '[', ']', '', 14) AS snippet,
bm25(resumes_fts) AS rank
FROM resumes_fts
JOIN resumes r ON r.resume_id = resumes_fts.resume_id
JOIN candidates c ON c.candidate_id = resumes_fts.candidate_id
WHERE resumes_fts MATCH ? AND {" AND ".join(where)}
ORDER BY rank
LIMIT ? OFFSET ?
"""
rows = con.execute(sql, [fts_q] + params + [limit, offset]).fetchall()
out: List[Dict[str, Any]] = []
for row in rows:
cand_id = row["candidate_id"]
contacts_map = _fetch_contacts_map(con, cand_id)
out.append(
{
"candidate_id": cand_id,
"name": row["name"],
"location": row["location"],
"remote": bool(row["remote"]) if row["remote"] is not None else None,
"experience_years": row["experience_years"],
"experience_years_eng": row["experience_years_eng"], # Passed to agent
"experience_confidence": row["experience_confidence"],
"salary_min": row["salary_min"],
"salary_max": row["salary_max"],
"salary_confidence": row["salary_confidence"],
"english_level": row["english_level"],
"roles": json.loads(row["roles_json"] or "[]"),
"skills": json.loads(row["skills_json"] or "[]"),
"primary_languages": json.loads(row["primary_languages_json"] or "[]"),
"backend_focus": (bool(row["backend_focus"]) if row["backend_focus"] is not None else None),
"doc_type": row["doc_type"],
"doc_type_confidence": row["doc_type_confidence"],
"parse_method": row["parse_method"],
"contacts": contacts_map,
"resume_id": row["resume_id"],
"snippet": row["snippet"],
"rank": row["rank"],
}
)
return out
# -----------------------------
# Agent helper (SearchPlan -> search())
# -----------------------------
def _join_csv(xs: List[str]) -> str:
xs = [str(x).strip() for x in (xs or []) if str(x).strip()]
return ",".join(xs)
def search_with_filters(con: sqlite3.Connection, plan: Any) -> Dict[str, Any]:
"""
Wrapper for agent.py.
Expects `plan` with fields:
query_text, skills_any, skills_all, roles_any, location, remote,
english_min, exp_years_min, salary_min, salary_max, limit, sort
Returns:
{ "items": [...], "count": N }
"""
filters = {
"remote": getattr(plan, "remote", None),
"location": getattr(plan, "location", None),
"experience_min": getattr(plan, "exp_years_min", None),
"salary_min": getattr(plan, "salary_min", None),
"salary_max": getattr(plan, "salary_max", None),
"english": getattr(plan, "english_min", None),
"roles_any": _join_csv(getattr(plan, "roles_any", []) or []),
"skills_any": _join_csv(getattr(plan, "skills_any", []) or []),
"skills_all": _join_csv(getattr(plan, "skills_all", []) or []),
}
items = search(
con,
query=(getattr(plan, "query_text", "") or "").strip(),
filters=filters,
limit=int(getattr(plan, "limit", 20) or 20),
offset=0,
)
sort_mode = (getattr(plan, "sort", "rank") or "rank").strip()
if sort_mode == "exp_desc":
def k(it: Dict[str, Any]):
v = it.get("experience_years")
return (v is None, -(v or 0.0))
items = sorted(items, key=k)
elif sort_mode == "salary_desc":
def k(it: Dict[str, Any]):
v = it.get("salary_max") if it.get("salary_max") is not None else it.get("salary_min")
return (v is None, -(v or 0))
items = sorted(items, key=k)
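# Both sort keys are (is_missing, -value): candidates without the value sort last,
# the rest descend by experience or expected salary.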
return {"items": items, "count": len(items)}

33
util.py Normal file
View File

@@ -0,0 +1,33 @@
from __future__ import annotations
import json
import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Optional
def utc_iso() -> str:
return datetime.utcnow().replace(microsecond=0).isoformat() + "Z"
class Logger:
def __init__(self, log_path: Optional[str] = None):
self.log_path = Path(log_path) if log_path else None
if self.log_path:
self.log_path.parent.mkdir(parents=True, exist_ok=True)
def _write(self, level: str, msg: str, extra: Optional[Dict[str, Any]] = None) -> None:
line = f"{utc_iso()} [{level}] {msg}"
print(line, file=sys.stdout, flush=True)
if self.log_path:
payload = {"ts": utc_iso(), "level": level, "msg": msg, "extra": extra or {}}
with self.log_path.open("a", encoding="utf-8") as f:
f.write(json.dumps(payload, ensure_ascii=False) + "\n")
def info(self, msg: str, extra: Optional[Dict[str, Any]] = None) -> None:
self._write("INFO", msg, extra)
def warn(self, msg: str, extra: Optional[Dict[str, Any]] = None) -> None:
self._write("WARN", msg, extra)
def error(self, msg: str, extra: Optional[Dict[str, Any]] = None) -> None:
self._write("ERROR", msg, extra)