Initial commit

This commit is contained in:
2026-03-11 15:27:10 +03:00
commit 8b4b8d54d1
34 changed files with 7407 additions and 0 deletions

39
extract/clean.py Normal file
View File

@@ -0,0 +1,39 @@
from __future__ import annotations
import re
from collections import Counter
import unicodedata
# Standalone page-counter lines like "Page 3 of 10" / "стр. 2/5" (EN + RU).
RE_PAGE = re.compile(r"^\s*(page|стр(аница)?)\s*\d+\s*(/|\s+of\s+)\s*\d+\s*$", re.I)
# A line consisting solely of punctuation/underscores (separator rules etc.).
RE_ONLY_PUNCT = re.compile(r"^[\W_]+$", re.U)
RE_MULTI_SPACE = re.compile(r"[ \t]+")
RE_MULTI_NL = re.compile(r"\n{3,}")
# Soft hyphen, zero-width space/non-joiner/joiner, BOM — stripped outright.
_INVISIBLE_CHARS = ["\u00ad", "\u200b", "\u200c", "\u200d", "\ufeff"]
# Bidi embeddings/overrides, word joiner, bidi isolates, non-characters.
_BIDI_CTRL_RE = re.compile(r"[\u202a-\u202e\u2060\u2066-\u2069\ufffe\uffff]")


def normalize_text(raw: str) -> str:
    """Normalize raw extracted resume text.

    Steps: unify newlines, strip invisible/bidi/control characters,
    collapse runs of spaces/tabs, drop page counters and punctuation-only
    lines, drop short lines repeated 4+ times (running headers/footers),
    and collapse 3+ consecutive newlines into a single blank line.
    """
    text = raw.replace("\r\n", "\n").replace("\r", "\n")
    for ch in _INVISIBLE_CHARS:
        text = text.replace(ch, "")
    text = _BIDI_CTRL_RE.sub("", text)
    # remove most control/format chars but keep line breaks and tabs
    text = "".join(
        ch for ch in text
        if ch in ("\n", "\t") or not unicodedata.category(ch).startswith("C")
    )
    # Second pass drops remaining non-printables (e.g. NBSP, line/para
    # separators). BUGFIX: "\t".isprintable() is False, so the previous
    # version deleted the tabs the first pass deliberately kept, gluing
    # words together; "\t" must be whitelisted here too. (The old
    # `ch != "\x0b"` test was redundant: \x0b is not printable anyway.)
    text = "".join(ch for ch in text if ch in ("\n", "\t") or ch.isprintable())
    lines = [RE_MULTI_SPACE.sub(" ", ln).strip() for ln in text.split("\n")]
    lines = [ln for ln in lines if ln and not RE_PAGE.match(ln) and not RE_ONLY_PUNCT.match(ln)]
    # De-duplicate boilerplate: a short line repeated on many pages is a
    # running header/footer, not content.
    counts = Counter(lines)
    filtered = []
    for ln in lines:
        if counts[ln] >= 4 and len(ln) <= 90:
            continue
        filtered.append(ln)
    text = "\n".join(filtered)
    text = RE_MULTI_NL.sub("\n\n", text).strip()
    return text
def to_fts_text(clean: str) -> str:
    """Collapse all whitespace runs to single spaces for full-text indexing."""
    return " ".join(clean.split())

134
extract/doc_type.py Normal file
View File

@@ -0,0 +1,134 @@
from __future__ import annotations
import re
from dataclasses import dataclass
from typing import List, Optional
@dataclass
class DocTypeResult:
    """Outcome of resume document-type detection."""

    # Detected type label, e.g. "hh_ru", "linkedin_pdf", "one_page_ru".
    doc_type: str
    # Heuristic confidence in [0, 1].
    confidence: float
    # Names of the pattern/heuristic signals that fired.
    signals: List[str]
# Weighted signal patterns: (compiled regex, weight, signal name).
# hh.ru resume export markers (RU).
_HH_PATTERNS = [
    (re.compile(r"\bhh\.ru\b", re.I), 2.0, "hh_domain"),
    (re.compile(r"\bheadhunter\b", re.I), 2.0, "headhunter"),
    (re.compile(r"\bрезюме\s+обновлено\b", re.I), 2.5, "hh_resume_updated"),
    # BUGFIX: this pattern was r"\елаемая..." — a mangled r"\bжелаемая...".
    # Python treats the non-ASCII escape "\е" as a literal "е", which
    # silently dropped both the word-boundary anchor and the leading "ж".
    (re.compile(r"\bжелаемая\s+должность\b", re.I), 1.2, "hh_desired_role"),
    (re.compile(r"\bключевые\s+навыки\b", re.I), 1.0, "hh_key_skills"),
    (re.compile(r"\bопыт\s+работы\b", re.I), 0.8, "hh_experience"),
]
# LinkedIn PDF export markers (EN section headers + brand).
_LI_PATTERNS = [
    (re.compile(r"\blinkedin\b", re.I), 2.5, "li_brand"),
    (re.compile(r"\blinkedin\.com\b", re.I), 2.0, "li_domain"),
    (re.compile(r"\bexperience\b", re.I), 0.9, "li_experience"),
    (re.compile(r"\beducation\b", re.I), 0.9, "li_education"),
    (re.compile(r"\bskills\b", re.I), 0.9, "li_skills"),
    (re.compile(r"\babout\b", re.I), 0.6, "li_about"),
]
# PowerPoint-export markers.
_PPTX_PATTERNS = [
    (re.compile(r"\bslide\b", re.I), 1.0, "pptx_slide"),
    (re.compile(r"\bpowerpoint\b", re.I), 1.3, "pptx_powerpoint"),
    (re.compile(r"\bpptx\b", re.I), 1.3, "pptx_ext"),
    (re.compile(r"\bpresentation\b", re.I), 0.8, "pptx_presentation"),
]
def _score_patterns(text: str, patterns) -> tuple[float, List[str]]:
    """Sum the weights of every pattern that fires; collect signal names."""
    hits = [(weight, name) for rx, weight, name in patterns if rx.search(text)]
    total = sum((w for w, _ in hits), 0.0)
    fired = [n for _, n in hits]
    return total, fired
def _confidence_from_score(score: float) -> float:
    """Map an accumulated signal score onto a coarse confidence scale."""
    steps = (
        (4.0, 0.92),
        (3.0, 0.85),
        (2.0, 0.75),
        (1.2, 0.62),
    )
    for floor, conf in steps:
        if score >= floor:
            return conf
    return 0.50 if score > 0.0 else 0.30
def detect_doc_type(clean_text: str, file_ext: Optional[str] = None) -> DocTypeResult:
    """Classify a resume document by weighted keyword/shape heuristics.

    Candidates: hh.ru export, LinkedIn PDF, PPTX export, one-page resume
    (split into RU/EN by dominant script), or a scan with almost no text
    layer. Falls back to "generic"/"generic_pdf" when nothing scores.
    (Cleanup: removed the unused `head_lc` local.)
    """
    lines = [ln.strip() for ln in (clean_text or "").splitlines() if ln.strip()]
    # Branding and section headers live in the head of the document.
    head_lines = lines[:80]
    head_text = "\n".join(head_lines)
    signals: List[str] = []
    hh_score, hh_signals = _score_patterns(head_text, _HH_PATTERNS)
    li_score, li_signals = _score_patterns(head_text, _LI_PATTERNS)
    pptx_score, pptx_signals = _score_patterns(head_text, _PPTX_PATTERNS)
    if file_ext and file_ext.lower() in (".pptx", ".ppt"):
        pptx_score += 2.0
        signals.append("pptx_ext")
    signals.extend(hh_signals + li_signals + pptx_signals)
    # One-page heuristic: short resumes with dense content
    total_len = len(clean_text or "")
    one_page_score = 0.0
    if len(lines) <= 70 and total_len <= 4500:
        one_page_score = 2.2
        signals.append("one_page_short")
    elif len(lines) <= 90 and total_len <= 6500:
        one_page_score = 1.6
        signals.append("one_page_medium")
    # Scan heuristic: very low textual content
    letters = sum(ch.isalpha() for ch in clean_text or "")
    total = max(1, len(clean_text or ""))
    letter_ratio = letters / total
    scan_score = 0.0
    if total_len < 200 or letter_ratio < 0.12:
        scan_score = 3.2
        signals.append("scan_low_text")
        # Extension bonus applies only when the low-text signal already
        # fired; otherwise every PDF would be tagged as a scan and the
        # "generic_pdf" fallback below would be unreachable.
        if file_ext and file_ext.lower() in (".pdf", ".png", ".jpg", ".jpeg", ".tiff"):
            scan_score += 0.6
            signals.append("scan_file_ext")
    candidates = [
        ("hh_ru", hh_score),
        ("linkedin_pdf", li_score),
        ("pptx_export", pptx_score),
        ("one_page", one_page_score),
        ("scan_pdf", scan_score),
    ]
    doc_type, best_score = max(candidates, key=lambda x: x[1])
    if best_score <= 0.0:
        base = "generic_pdf" if (file_ext and file_ext.lower() == ".pdf") else "generic"
        return DocTypeResult(doc_type=base, confidence=0.35, signals=signals)
    confidence = _confidence_from_score(best_score)
    # If scan is detected strongly, prefer it
    if doc_type == "scan_pdf" and confidence >= 0.8:
        return DocTypeResult(doc_type="scan_pdf", confidence=confidence, signals=signals)
    # Split one-page into ru/en
    if doc_type == "one_page":
        if _looks_cyrillic(head_text):
            return DocTypeResult(doc_type="one_page_ru", confidence=confidence, signals=signals)
        return DocTypeResult(doc_type="one_page_en", confidence=confidence, signals=signals)
    return DocTypeResult(doc_type=doc_type, confidence=confidence, signals=signals)
def _looks_cyrillic(text: str) -> bool:
    """True when Cyrillic letters outnumber Latin ones and 10+ are present."""
    cyrillic_count = len(re.findall(r"[А-Яа-яЁё]", text))
    latin_count = len(re.findall(r"[A-Za-z]", text))
    return cyrillic_count >= 10 and cyrillic_count > latin_count

159
extract/experience.py Normal file
View File

@@ -0,0 +1,159 @@
from __future__ import annotations
import re
from dataclasses import dataclass
from datetime import date
from typing import Dict, List, Optional, Tuple
# Month maps (EN + RU)
# Keys are lowercase month-name prefixes; _parse_mon matches with
# startswith(), so full names ("january", "январь"), abbreviations and
# RU case-inflected forms ("января") all resolve.
MONTHS = {
    "jan": 1, "january": 1, "янв": 1, "январ": 1,
    "feb": 2, "february": 2, "фев": 2, "феврал": 2,
    "mar": 3, "march": 3, "мар": 3, "март": 3,
    "apr": 4, "april": 4, "апр": 4, "апрел": 4,
    "may": 5, "май": 5,
    "jun": 6, "june": 6, "июн": 6, "июнь": 6,
    "jul": 7, "july": 7, "июл": 7, "июль": 7,
    "aug": 8, "august": 8, "авг": 8, "август": 8,
    "sep": 9, "september": 9, "сен": 9, "сент": 9,
    "oct": 10, "october": 10, "окт": 10, "октя": 10,
    "nov": 11, "november": 11, "ноя": 11, "ноябр": 11,
    "dec": 12, "december": 12, "дек": 12, "дека": 12,
}
# "Still employed" markers (EN + RU). BUGFIX: the RU abbreviation was
# written "по н\\." which matches a literal backslash and could never fire;
# it must be "по н\." (escaped dot) to match "по н.в.".
PRESENT_RE = re.compile(r"\b(present|now|current|настоящее время|по н\.|по настоящее)\b", re.I)
# Direct "X years" patterns
DIRECT_YEARS_RE = re.compile(r"(\d+(?:[.,]\d+)?)\s*(?:\+?\s*)?(?:years?|yrs?|лет|года|год)\b", re.I)
# Dates like 03.2019, 2019, Jan 2020, янв 2020
MMYYYY_RE = re.compile(r"\b(0?[1-9]|1[0-2])[./-](\d{4})\b")
YYYY_RE = re.compile(r"\b(19\d{2}|20\d{2})\b")
MON_YYYY_RE = re.compile(r"\b([A-Za-z]{3,9}|[А-Яа-я]{3,9})\.?\s*(\d{4})\b")
# Range separators. BUGFIX: the alternation contained an empty branch
# ("—||-"), so the separator matched the empty string and virtually any
# line parsed as a range; the en dash "–" was evidently lost. Restored.
RANGE_RE = re.compile(r"(?P<a>.+?)\s*(?:—|–|-|to|по)\s*(?P<b>.+?)$", re.I)
@dataclass
class ExpResult:
    """Result of total-experience extraction."""

    # Estimated total experience in years; None when nothing parseable.
    years: Optional[float]
    # Heuristic confidence in [0, 1].
    confidence: float
    # Trace of matches/ranges/intervals used, for debugging.
    debug: Dict
def _clamp_years(y: float) -> Optional[float]:
    """Return *y* when in the plausible 0-45 year range, else None."""
    return y if 0.0 <= y <= 45.0 else None
def _parse_mon(mon: str) -> Optional[int]:
    """Resolve a month name/abbreviation (EN/RU) to 1..12, else None."""
    token = re.sub(r"[^\wа-я]+", "", mon.strip().lower(), flags=re.I)
    # Prefix match so "январ"/"january"/"jan" all resolve via MONTHS keys.
    for prefix, number in MONTHS.items():
        if token.startswith(prefix):
            return number
    return None
def _as_ymd(y: int, m: int) -> date:
    """Build a date pinned to the first day of the given month."""
    return date(year=y, month=m, day=1)
def _parse_one_date(s: str) -> Optional[date]:
    """Parse one side of a date range to a month-resolution date.

    Tries, in order: "present" markers (mapped to the current month),
    MM.YYYY, "Mon YYYY" (EN/RU month names), then a bare YYYY (mapped to
    January). Returns None when nothing matches.
    """
    s = s.strip()
    if PRESENT_RE.search(s):
        # Ongoing employment: use the current month as the endpoint.
        today = date.today()
        return date(today.year, today.month, 1)
    m1 = MMYYYY_RE.search(s)
    if m1:
        mm = int(m1.group(1))
        yy = int(m1.group(2))
        return _as_ymd(yy, mm)
    m2 = MON_YYYY_RE.search(s)
    if m2:
        mon = _parse_mon(m2.group(1))
        yy = int(m2.group(2))
        if mon:
            return _as_ymd(yy, mon)
    m3 = YYYY_RE.search(s)
    if m3:
        yy = int(m3.group(1))
        # Bare year: assume January.
        return _as_ymd(yy, 1)
    return None
def _merge_intervals(intervals: List[Tuple[date, date]]) -> List[Tuple[date, date]]:
    """Merge overlapping/touching (start, end) intervals; result is sorted."""
    if not intervals:
        return []
    ordered = sorted(intervals, key=lambda iv: (iv[0], iv[1]))
    merged: List[Tuple[date, date]] = [ordered[0]]
    for start, end in ordered[1:]:
        last_start, last_end = merged[-1]
        if start <= last_end:
            # Overlap (or touch): extend the interval in place.
            merged[-1] = (last_start, max(last_end, end))
        else:
            merged.append((start, end))
    return merged
def _months_between(a: date, b: date) -> int:
    """Month-resolution difference b - a (callers ensure b >= a)."""
    year_months = (b.year - a.year) * 12
    return year_months + (b.month - a.month)
def extract_experience(text: str) -> ExpResult:
    """Estimate total years of experience from resume text.

    Strategy:
      1) Trust an explicit "N years"/"N лет" statement (highest found).
      2) Otherwise collect per-line date ranges, merge overlapping
         intervals, and sum month spans.
    """
    debug: Dict = {"direct_matches": [], "ranges": [], "intervals": []}
    # 1) Direct years
    directs = []
    for m in DIRECT_YEARS_RE.finditer(text):
        try:
            v = float(m.group(1).replace(",", "."))
            if 0 <= v <= 45:
                directs.append(v)
                debug["direct_matches"].append({"match": m.group(0), "value": v})
        except Exception:
            pass
    if directs:
        years = _clamp_years(max(directs))
        return ExpResult(years=years, confidence=0.90, debug=debug)
    # 2) Ranges in lines: try to detect "start - end"
    intervals: List[Tuple[date, date]] = []
    # BUGFIX: the quick separator pre-filter contained empty strings
    # ('"" in ln' is always True), which disabled it entirely; the em/en
    # dashes were evidently lost in transit. Restored.
    separators = ("—", "–", "-", " to ", " по ")
    for line in text.splitlines():
        ln = line.strip()
        if len(ln) < 7:
            continue
        # require range separator
        if not any(sep in ln for sep in separators):
            continue
        rr = RANGE_RE.match(ln)
        if not rr:
            continue
        da = _parse_one_date(rr.group("a"))
        db = _parse_one_date(rr.group("b"))
        if da and db:
            if db < da:
                da, db = db, da
            # cap extremely old
            if da.year < 1990:
                continue
            intervals.append((da, db))
            debug["ranges"].append({"line": ln, "start": da.isoformat(), "end": db.isoformat()})
    intervals = _merge_intervals(intervals)
    debug["intervals"] = [{"start": s.isoformat(), "end": e.isoformat()} for s, e in intervals]
    if not intervals:
        return ExpResult(years=None, confidence=0.0, debug=debug)
    total_months = sum(max(0, _months_between(s, e)) for s, e in intervals)
    years = _clamp_years(round(total_months / 12.0, 2))
    # confidence depends on amount of evidence
    conf = 0.70 if total_months >= 12 else 0.55
    return ExpResult(years=years, confidence=conf, debug=debug)

View File

@@ -0,0 +1,144 @@
from __future__ import annotations
import re
from dataclasses import dataclass, asdict
from datetime import date
from typing import List, Optional
# Lowercase month-name prefixes (EN + RU); _parse_mon matches with
# startswith(), so full names, abbreviations and RU inflections resolve.
MONTHS = {
    "jan": 1, "january": 1, "янв": 1, "январ": 1,
    "feb": 2, "february": 2, "фев": 2, "феврал": 2,
    "mar": 3, "march": 3, "мар": 3, "март": 3,
    "apr": 4, "april": 4, "апр": 4, "апрел": 4,
    "may": 5, "май": 5,
    "jun": 6, "june": 6, "июн": 6, "июнь": 6,
    "jul": 7, "july": 7, "июл": 7, "июль": 7,
    "aug": 8, "august": 8, "авг": 8, "август": 8,
    "sep": 9, "september": 9, "сен": 9, "сент": 9,
    "oct": 10, "october": 10, "окт": 10, "октя": 10,
    "nov": 11, "november": 11, "ноя": 11, "ноябр": 11,
    "dec": 12, "december": 12, "дек": 12, "дека": 12,
}
# "Still employed" markers (EN + RU). BUGFIX: "по н\\." matched a literal
# backslash and could never fire; it must be "по н\." to match "по н.в.".
PRESENT_RE = re.compile(r"\b(present|now|current|настоящее время|по н\.|по настоящее)\b", re.I)
MMYYYY_RE = re.compile(r"\b(0?[1-9]|1[0-2])[./-](\d{4})\b")
YYYY_RE = re.compile(r"\b(19\d{2}|20\d{2})\b")
MON_YYYY_RE = re.compile(r"\b([A-Za-z]{3,9}|[А-Яа-я]{3,9})\.?\s*(\d{4})\b")
# Range separators. BUGFIX: both alternations contained an empty branch
# ("—||-") that matched the empty string, so virtually any line counted
# as a range; the en dash "–" was evidently lost. Restored.
RANGE_RE = re.compile(r"(?P<a>.+?)\s*(?:—|–|-|to|по)\s*(?P<b>.+?)$", re.I)
YEAR_RANGE_ONLY_RE = re.compile(r"^\s*\d{4}\s*(?:—|–|-|to|по)\s*\d{4}\s*$", re.I)
# Context words marking education entries (EN + RU), used to skip
# study-period ranges when extracting positions.
EDU_CONTEXT_RE = re.compile(
    r"\b("
    r"education|university|institute|college|academy|school|bachelor|master|degree|faculty|"
    r"образование|университет|институт|академ|колледж|школа|бакалав|магистр|факультет"
    r")\b",
    re.I,
)
@dataclass
class Position:
    """One work-history entry extracted from resume text."""

    # Job title parsed from the header line after the date range; may be None.
    title: Optional[str]
    # Employer name (second header segment); may be None.
    company: Optional[str]
    # ISO dates (first of month) for the employment range; None when unparsed.
    date_from: Optional[str]
    date_to: Optional[str]
    # True when the range ends with a "present"-style marker.
    is_current: Optional[bool]
    # Free-text lines following the header up to the next range line.
    description: Optional[str]
def _parse_mon(mon: str) -> Optional[int]:
    """Map a month name/abbreviation (EN/RU) onto 1..12; None if unknown."""
    cleaned = re.sub(r"[^\wа-я]+", "", mon.strip().lower(), flags=re.I)
    # MONTHS keys are prefixes, so inflected RU forms also resolve.
    for key, value in MONTHS.items():
        if cleaned.startswith(key):
            return value
    return None
def _as_ymd(y: int, m: int) -> date:
    """Month-resolution date: first day of month m in year y."""
    return date(y, m, 1)
def _parse_one_date(s: str) -> Optional[date]:
    """Parse one side of a date range to a month-resolution date.

    Order: "present" markers (current month), MM.YYYY, "Mon YYYY"
    (EN/RU names), then bare YYYY (January). None when nothing matches.
    """
    s = s.strip()
    if PRESENT_RE.search(s):
        # Ongoing employment: endpoint is the current month.
        today = date.today()
        return date(today.year, today.month, 1)
    m1 = MMYYYY_RE.search(s)
    if m1:
        mm = int(m1.group(1))
        yy = int(m1.group(2))
        return _as_ymd(yy, mm)
    m2 = MON_YYYY_RE.search(s)
    if m2:
        mon = _parse_mon(m2.group(1))
        yy = int(m2.group(2))
        if mon:
            return _as_ymd(yy, mon)
    m3 = YYYY_RE.search(s)
    if m3:
        yy = int(m3.group(1))
        # Bare year: assume January.
        return _as_ymd(yy, 1)
    return None
def extract_positions(text: str, max_items: int = 40) -> List[Position]:
    """Scan resume lines for date-range headers and build Position entries.

    A position starts at a line matching RANGE_RE; the following line is
    treated as a "Title, Company" header; subsequent lines up to the next
    range line become the description. Education-looking ranges are
    skipped. At most *max_items* positions are returned.
    """
    lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]
    positions: List[Position] = []
    i = 0
    while i < len(lines) and len(positions) < max_items:
        ln = lines[i]
        # Cheap separator pre-filter before running the regex.
        # NOTE(review): this tuple contains empty strings, so '"" in ln'
        # is always True and the pre-filter is a no-op — the em/en dashes
        # look lost in transit; RANGE_RE below still gates. TODO confirm.
        if not any(x in ln for x in ("", "", "-", " to ", " по ")):
            i += 1
            continue
        rr = RANGE_RE.match(ln)
        if not rr:
            i += 1
            continue
        # Look at surrounding lines to decide whether this range is an
        # education entry rather than employment.
        ctx = " ".join(lines[max(0, i - 2): min(len(lines), i + 4)])
        if YEAR_RANGE_ONLY_RE.match(ln) and EDU_CONTEXT_RE.search(ctx):
            i += 1
            continue
        da = _parse_one_date(rr.group("a"))
        db = _parse_one_date(rr.group("b"))
        if not da or not db:
            i += 1
            continue
        # Ignore implausibly old start dates.
        if da.year < 1990:
            i += 1
            continue
        is_current = PRESENT_RE.search(rr.group("b")) is not None
        title = None
        company = None
        desc_lines: List[str] = []
        if i + 1 < len(lines):
            if EDU_CONTEXT_RE.search(lines[i + 1]):
                i += 1
                continue
            # Header convention: "Title, Company" (also "|" or "/" separated).
            header = lines[i + 1]
            parts = [p.strip() for p in re.split(r"[,|/]", header) if p.strip()]
            if parts:
                title = parts[0]
            if len(parts) > 1:
                company = parts[1]
        # Collect description lines until the next date-range line.
        j = i + 2
        while j < len(lines):
            if any(x in lines[j] for x in ("", "", "-", " to ", " по ")) and RANGE_RE.match(lines[j]):
                break
            desc_lines.append(lines[j])
            j += 1
        positions.append(
            Position(
                title=title,
                company=company,
                date_from=da.isoformat(),
                date_to=db.isoformat(),
                is_current=is_current,
                description="\n".join(desc_lines).strip() if desc_lines else None,
            )
        )
        # Jump the cursor past the consumed description.
        i = j
    return positions
def positions_to_dicts(items: "List[Position]") -> List[dict]:
    """Serialize Position dataclasses into plain JSON-ready dicts."""
    out: List[dict] = []
    for item in items:
        out.append(asdict(item))
    return out

585
extract/llm.py Normal file
View File

@@ -0,0 +1,585 @@
from __future__ import annotations
import hashlib
import json
import os
import re
import sqlite3
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
try:
import httpx # type: ignore
except Exception: # pragma: no cover
httpx = None # type: ignore
def resolve_llm_runtime() -> Dict[str, str]:
    """
    Resolve OpenAI-compatible runtime config.
    Supports both generic vars and Mistral aliases:
    - generic: LLM_BASE_URL / LLM_MODEL / LLM_API_KEY
    - mistral: MISTRAL_API_KEY / MISTRAL_MODEL / MISTRAL_BASE_URL
    """
    def _env(name: str, default: str = "") -> str:
        # Empty values fall back to the default, matching `or` semantics.
        return (os.environ.get(name) or default).strip()

    provider = _env("LLM_PROVIDER").lower()
    base_url = _env("LLM_BASE_URL")
    model = _env("LLM_MODEL")
    api_key = _env("LLM_API_KEY")
    mistral_key = _env("MISTRAL_API_KEY")
    mistral_model = _env("MISTRAL_MODEL")
    mistral_base = _env("MISTRAL_BASE_URL", "https://api.mistral.ai/v1")
    # Mistral aliases fill in whatever the generic vars left blank.
    api_key = api_key or mistral_key
    model = model or mistral_model
    mistral_hinted = bool(
        mistral_key or mistral_model or provider == "mistral" or os.environ.get("MISTRAL_BASE_URL")
    )
    if not base_url and mistral_hinted:
        base_url = mistral_base
    if base_url:
        base_url = base_url.rstrip("/")
    if not provider:
        looks_mistral = "mistral.ai" in base_url or (model and model.lower().startswith("mistral"))
        provider = "mistral" if looks_mistral else "generic"
    return {
        "provider": provider,
        "base_url": base_url,
        "model": model,
        "api_key": api_key,
    }
# ------------- Public API -------------
def llm_parse_enabled() -> bool:
    """
    Enabled only if httpx is available and both base_url/model are resolved.
    Opt-out via LLM_PARSE_ENABLED=0.
    """
    if httpx is None:
        return False
    flag = os.environ.get("LLM_PARSE_ENABLED", "1").lower()
    if flag in ("0", "false", "no"):
        return False
    runtime = resolve_llm_runtime()
    return bool(runtime["base_url"] and runtime["model"])
# Cache-busting prompt versions: baked into cache keys so that bumping a
# version after editing a template invalidates stale cached completions.
_PROMPT_VERSION = "v3_sections_doc_type"
_REVIEW_PROMPT_VERSION = "v1_review_merge"
@dataclass
class LLMExtraction:
    """Structured resume fields returned by the LLM extraction prompt."""

    roles: List[str]
    skills: List[str]
    primary_languages: List[str]
    seniority: Optional[str]
    backend_focus: Optional[bool]
    experience_years_total: Optional[float]
    experience_years_engineering: Optional[float]
    english_level: Optional[str]
    location: Optional[str]
    remote_ok: Optional[bool]
    salary_min_usd: Optional[int]
    salary_max_usd: Optional[int]
    salary_min_rub: Optional[int]
    salary_max_rub: Optional[int]
    highlights: List[str]
    keywords: List[str]

    @staticmethod
    def from_obj(obj: Dict[str, Any]) -> "LLMExtraction":
        """Build an LLMExtraction from an untrusted JSON-ish dict,
        coercing each field defensively (bad values become None/[])."""
        def _as_list(v: Any) -> List[str]:
            # Accept a list or a scalar; drop empty/whitespace-only items.
            if v is None:
                return []
            if isinstance(v, list):
                return [str(x).strip() for x in v if str(x).strip()]
            s = str(v).strip()
            return [s] if s else []
        def _as_float(v: Any) -> Optional[float]:
            try:
                return float(v)
            except Exception:
                return None
        def _as_int(v: Any) -> Optional[int]:
            # Via float() so "12.0" and 12.9 coerce instead of raising.
            try:
                return int(float(v))
            except Exception:
                return None
        def _as_bool(v: Any) -> Optional[bool]:
            # Accept real booleans plus common textual/numeric spellings.
            if isinstance(v, bool):
                return v
            if v is None:
                return None
            s = str(v).strip().lower()
            if s in ("true", "1", "yes", "y"):
                return True
            if s in ("false", "0", "no", "n"):
                return False
            return None
        return LLMExtraction(
            roles=_as_list(obj.get("roles")),
            skills=_as_list(obj.get("skills")),
            primary_languages=_as_list(obj.get("primary_languages")),
            seniority=(str(obj.get("seniority")).strip().lower() or None) if obj.get("seniority") else None,
            backend_focus=_as_bool(obj.get("backend_focus")),
            experience_years_total=_as_float(obj.get("experience_years_total")),
            experience_years_engineering=_as_float(obj.get("experience_years_engineering")),
            english_level=(str(obj.get("english_level")).strip().upper() or None) if obj.get("english_level") else None,
            location=(str(obj.get("location")).strip() or None) if obj.get("location") else None,
            remote_ok=_as_bool(obj.get("remote_ok")),
            salary_min_usd=_as_int(obj.get("salary_min_usd")),
            salary_max_usd=_as_int(obj.get("salary_max_usd")),
            salary_min_rub=_as_int(obj.get("salary_min_rub")),
            salary_max_rub=_as_int(obj.get("salary_max_rub")),
            highlights=_as_list(obj.get("highlights")),
            keywords=_as_list(obj.get("keywords")),
        )
def llm_extract_profile(
    clean_text: str,
    *,
    con: Optional[sqlite3.Connection] = None,
    doc_type: Optional[str] = None,
    sections: Optional[Dict[str, str]] = None,
) -> Tuple[Optional[LLMExtraction], Dict[str, Any]]:
    """
    Returns (LLMExtraction | None, debug_info).
    - Uses cache on disk/sqlite to keep throughput high.
    - Silently degrades to None on any failure.
    """
    runtime = resolve_llm_runtime()
    dbg: Dict[str, Any] = {
        "enabled": llm_parse_enabled(),
        "provider": runtime.get("provider"),
        "model": runtime.get("model"),
        "from_cache": False,
        "cache_backend": None,
        "error": None,
        "prompt_version": _PROMPT_VERSION,
    }
    if not llm_parse_enabled():
        return None, dbg
    # Cache key covers text content, model and prompt version, so any of
    # them changing invalidates the cached completion.
    text_hash = hashlib.sha1(clean_text.encode("utf-8", errors="ignore")).hexdigest()
    cache_key = f"extract:{text_hash}:{runtime['model']}:{_PROMPT_VERSION}"
    payload = _build_payload(
        clean_text,
        doc_type=doc_type,
        sections=sections,
        prompt_version=_PROMPT_VERSION,
        temperature=float(os.environ.get("LLM_PARSE_TEMPERATURE", 0.1)),
        max_tokens=int(os.environ.get("LLM_PARSE_MAX_TOKENS", 700)),
        system_prompt="You output ONLY JSON for structured resume extraction.",
        prompt_template=_PROMPT_TEMPLATE,
    )
    data = _cached_llm_json_call(
        con=con,
        cache_key=cache_key,
        model=runtime["model"],
        payload=payload,
        dbg=dbg,
    )
    if data is None:
        return None, dbg
    return LLMExtraction.from_obj(data), dbg
def llm_review_profile(
    clean_text: str,
    *,
    draft: Dict[str, Any],
    con: Optional[sqlite3.Connection] = None,
    doc_type: Optional[str] = None,
    sections: Optional[Dict[str, str]] = None,
) -> Tuple[Optional[LLMExtraction], Dict[str, Any]]:
    """
    Second-pass validator:
    - Takes already parsed JSON (draft)
    - Re-checks every field against resume text
    - Returns corrected extraction for safe merge in pipeline
    """
    runtime = resolve_llm_runtime()
    dbg: Dict[str, Any] = {
        "enabled": llm_parse_enabled(),
        "provider": runtime.get("provider"),
        "model": runtime.get("model"),
        "from_cache": False,
        "cache_backend": None,
        "error": None,
        "prompt_version": _REVIEW_PROMPT_VERSION,
        "quality_score": None,
        "changed_fields": [],
        "issues_found": [],
    }
    if not llm_parse_enabled():
        return None, dbg
    # Canonicalize the draft (whitelisted keys, sorted) so the cache key
    # is stable regardless of the caller's dict ordering.
    clean_draft = _sanitize_review_draft(draft)
    draft_blob = json.dumps(clean_draft, ensure_ascii=False, sort_keys=True)
    text_hash = hashlib.sha1(clean_text.encode("utf-8", errors="ignore")).hexdigest()
    draft_hash = hashlib.sha1(draft_blob.encode("utf-8", errors="ignore")).hexdigest()
    cache_key = f"review:{text_hash}:{draft_hash}:{runtime['model']}:{_REVIEW_PROMPT_VERSION}"
    payload = _build_payload(
        clean_text,
        doc_type=doc_type,
        sections=sections,
        prompt_version=_REVIEW_PROMPT_VERSION,
        temperature=float(os.environ.get("LLM_REVIEW_TEMPERATURE", 0.0)),
        max_tokens=int(os.environ.get("LLM_REVIEW_MAX_TOKENS", 850)),
        system_prompt="You output ONLY JSON for resume parsing quality review.",
        prompt_template=_REVIEW_PROMPT_TEMPLATE,
        extra_vars={"draft_json": draft_blob},
    )
    data = _cached_llm_json_call(
        con=con,
        cache_key=cache_key,
        model=runtime["model"],
        payload=payload,
        dbg=dbg,
    )
    if data is None:
        return None, dbg
    # The model should wrap the result in "corrected"; tolerate a flat
    # object as a fallback.
    corrected_obj: Dict[str, Any]
    if isinstance(data.get("corrected"), dict):
        corrected_obj = data["corrected"]
    else:
        corrected_obj = data
    dbg["quality_score"] = _as_float(data.get("quality_score"))
    dbg["changed_fields"] = _as_str_list(data.get("changed_fields"))
    dbg["issues_found"] = _as_str_list(data.get("issues_found"))
    return LLMExtraction.from_obj(corrected_obj), dbg
# ------------- Internal helpers -------------
# First-pass extraction prompt (RU). Placeholders filled by _build_payload:
# {doc_type}, {sections_block}, {resume_text}. Doubled braces ({{ }})
# survive str.format() as literal JSON braces.
_PROMPT_TEMPLATE = """
Ты - ассистент, который структурирует резюме разработчиков. Отвечай ТОЛЬКО JSON.
Используй только факты из текста, ничего не придумывай. Если данных нет - ставь null или пустой список.
Схема:
{{
"roles": ["backend","devops","frontend","qa","data engineer","android","ios"],
"skills": ["python","go","k8s","postgres","react", "..."],
"primary_languages": ["python","go","java","c++", "..."],
"seniority": "intern|junior|middle|senior|lead|principal|null",
"backend_focus": true|false|null,
"experience_years_total": number|null,
"experience_years_engineering": number|null,
"english_level": "A1|A2|B1|B2|C1|C2|null",
"location": "city, country|null",
"remote_ok": true|false|null,
"salary_min_usd": int|null,
"salary_max_usd": int|null,
"salary_min_rub": int|null,
"salary_max_rub": int|null,
"highlights": ["кратко достижения (1-2 предложения)"],
"keywords": ["уникальные ключевые слова, продукты или домены"]
}}
Не включай контактные данные в skills/keywords.
Detected doc_type: {doc_type}
Sections (if present):
{sections_block}
Full text snippet (use only if needed):
```TEXT
{resume_text}
```
"""
# Second-pass review/merge prompt (RU); additionally takes {draft_json}
# with the first-pass result to re-check against the resume text.
_REVIEW_PROMPT_TEMPLATE = """
Ты валидатор качества парсинга резюме разработчиков. Отвечай ТОЛЬКО JSON.
У тебя есть черновой JSON после эвристик/первичного парсинга. Нужно перепроверить каждое поле по тексту резюме.
Исправляй только то, что прямо подтверждается текстом. Нельзя выдумывать.
Верни JSON строго такой формы:
{{
"corrected": {{
"roles": ["..."],
"skills": ["..."],
"primary_languages": ["..."],
"seniority": "intern|junior|middle|senior|lead|principal|null",
"backend_focus": true|false|null,
"experience_years_total": number|null,
"experience_years_engineering": number|null,
"english_level": "A1|A2|B1|B2|C1|C2|null",
"location": "city, country|null",
"remote_ok": true|false|null,
"salary_min_usd": int|null,
"salary_max_usd": int|null,
"salary_min_rub": int|null,
"salary_max_rub": int|null,
"highlights": ["..."],
"keywords": ["..."]
}},
"changed_fields": ["field_name", "..."],
"issues_found": ["кратко что было неверно/сомнительно", "..."],
"quality_score": 0.0
}}
Черновик JSON:
```DRAFT
{draft_json}
```
Detected doc_type: {doc_type}
Sections (if present):
{sections_block}
Full text snippet (use only if needed):
```TEXT
{resume_text}
```
"""
def _trim_text(text: str, max_len: int = 9000) -> str:
    """
    Keep head and tail to preserve summary + recent projects.
    """
    if len(text) <= max_len:
        return text
    head = text[: max_len // 2]
    tail = text[-max_len // 2 :]
    return f"{head}\n...\n{tail}"
def _build_payload(
    clean_text: str,
    *,
    doc_type: Optional[str],
    sections: Optional[Dict[str, str]],
    prompt_version: str,
    temperature: float,
    max_tokens: int,
    system_prompt: str,
    prompt_template: str,
    extra_vars: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Assemble the request task consumed by _cached_llm_json_call /
    _llm_call_json: base_url, chat-completions payload, headers, timeout."""
    runtime = resolve_llm_runtime()
    base_url = runtime["base_url"]
    model = runtime["model"]
    sections_block = _build_sections_block(sections)
    # Template variables; extra_vars (e.g. draft_json) may extend them.
    tpl_vars = {
        "resume_text": _trim_text(clean_text),
        "doc_type": (doc_type or "unknown"),
        "sections_block": sections_block or "(no sections detected)",
    }
    if extra_vars:
        tpl_vars.update(extra_vars)
    prompt = prompt_template.format(**tpl_vars)
    return {
        "base_url": base_url,
        "model": model,
        "prompt_version": prompt_version,
        "payload": {
            "model": model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
            ],
            "temperature": temperature,
            "max_tokens": max_tokens,
        },
        "headers": _build_headers(runtime),
        "timeout": float(os.environ.get("LLM_PARSE_TIMEOUT", 18.0)),
    }
def _build_headers(runtime: Dict[str, str]) -> Dict[str, str]:
    """HTTP headers for the completions call; bearer auth when a key is set."""
    headers: Dict[str, str] = {"Content-Type": "application/json"}
    key = runtime.get("api_key", "")
    if key:
        headers["Authorization"] = f"Bearer {key}"
    return headers
def _cached_llm_json_call(
    *,
    con: Optional[sqlite3.Connection],
    cache_key: str,
    model: str,
    payload: Dict[str, Any],
    dbg: Dict[str, Any],
) -> Optional[Dict[str, Any]]:
    """Call the LLM with a two-level cache (sqlite first, then disk).

    Returns the parsed JSON dict or None; call failures are recorded in
    dbg["error"] instead of raising.
    """
    # 1) sqlite cache (shared with the rest of the pipeline).
    data = _cache_get_sqlite(con, cache_key)
    if data:
        dbg["from_cache"] = True
        dbg["cache_backend"] = "sqlite"
        return data
    # 2) disk cache; degrade gracefully when the directory is unwritable.
    cache_dir = Path(os.environ.get("LLM_PARSE_CACHE", ".cache/llm_parse")).resolve()
    cache_ok = True
    try:
        cache_dir.mkdir(parents=True, exist_ok=True)
    except Exception:
        cache_ok = False
    safe_name = cache_key.replace(":", "_")
    cache_path = (cache_dir / f"{safe_name}.json") if cache_ok else None
    if cache_path and cache_path.exists():
        try:
            data = json.loads(cache_path.read_text(encoding="utf-8"))
            dbg["from_cache"] = True
            dbg["cache_backend"] = "disk"
            return data
        except Exception:
            # Corrupt cache file: fall through to a fresh call.
            pass
    # 3) real call; populate both caches on success.
    try:
        data = _llm_call_json(payload)
        if con:
            _cache_put_sqlite(con, cache_key, model, data)
        if cache_path:
            cache_path.write_text(json.dumps(data, ensure_ascii=False), encoding="utf-8")
        return data
    except Exception as e:  # pragma: no cover - network/LLM failures
        dbg["error"] = repr(e)
        return None
def _llm_call_json(task: Dict[str, Any]) -> Dict[str, Any]:
    """POST an OpenAI-style chat completion and parse JSON from the reply.

    Raises on transport errors, non-2xx responses, or a reply that
    contains no JSON object; callers are expected to catch.
    """
    if httpx is None:
        raise RuntimeError("httpx is not installed")
    base_url: str = task["base_url"]
    payload: Dict[str, Any] = task["payload"]
    timeout = float(task.get("timeout", 18.0))
    with httpx.Client(timeout=timeout) as client:
        r = client.post(f"{base_url}/chat/completions", headers=task["headers"], json=payload)
        r.raise_for_status()
        data = r.json()
    content = data["choices"][0]["message"]["content"]
    # Some providers return content as a list of typed blocks.
    if isinstance(content, list):
        parts = []
        for block in content:
            if isinstance(block, dict):
                parts.append(str(block.get("text") or ""))
            else:
                parts.append(str(block))
        content = "\n".join(parts)
    content = str(content)
    # Grab the outermost {...} so surrounding prose/markdown is tolerated.
    m = re.search(r"\{.*\}", content, flags=re.S)
    if not m:
        raise ValueError("LLM did not return JSON")
    return json.loads(m.group(0))
def _build_sections_block(sections: Optional[Dict[str, str]]) -> str:
    """Render detected resume sections as labelled snippets for the prompt."""
    if not sections:
        return ""
    order = [
        ("about", "ABOUT"),
        ("skills", "SKILLS"),
        ("experience", "EXPERIENCE"),
        ("education", "EDUCATION"),
        ("contacts", "CONTACTS"),
    ]
    rendered: List[str] = []
    for key, label in order:
        body = sections.get(key)
        if not body:
            continue
        rendered.append(f"[{label}]\n{_trim_text(body, max_len=1800)}")
    return "\n\n".join(rendered)
def _sanitize_review_draft(draft: Dict[str, Any]) -> Dict[str, Any]:
    """Whitelist draft keys and round-trip through LLMExtraction so the
    review prompt only ever sees well-typed, known schema fields."""
    if not isinstance(draft, dict):
        draft = {}
    # Only fields of the extraction schema may reach the prompt.
    allowed = {
        "roles",
        "skills",
        "primary_languages",
        "seniority",
        "backend_focus",
        "experience_years_total",
        "experience_years_engineering",
        "english_level",
        "location",
        "remote_ok",
        "salary_min_usd",
        "salary_max_usd",
        "salary_min_rub",
        "salary_max_rub",
        "highlights",
        "keywords",
    }
    cleaned = {k: v for k, v in draft.items() if k in allowed}
    # from_obj coerces values defensively; asdict yields a plain dict.
    return asdict(LLMExtraction.from_obj(cleaned))
def _as_float(v: Any) -> Optional[float]:
    """Coerce to a float clamped into [0, 1]; negatives/garbage give None."""
    try:
        value = float(v)
    except Exception:
        return None
    if value < 0:
        return None
    return min(value, 1.0)
def _as_str_list(v: Any) -> List[str]:
    """Normalize a value into a list of non-empty stripped strings."""
    if v is None:
        return []
    if not isinstance(v, list):
        single = str(v).strip()
        return [single] if single else []
    return [str(item).strip() for item in v if str(item).strip()]
def _cache_get_sqlite(con: Optional[sqlite3.Connection], cache_key: str) -> Optional[Dict[str, Any]]:
    """Best-effort lookup of a cached LLM result; never raises.

    NOTE(review): row["result_json"] requires con.row_factory to be
    sqlite3.Row; with a plain connection the TypeError is swallowed and
    None is returned — confirm the pipeline sets the row factory.
    """
    if con is None:
        return None
    try:
        row = con.execute("SELECT result_json FROM llm_cache WHERE cache_key=?", (cache_key,)).fetchone()
        if row and row["result_json"]:
            return json.loads(row["result_json"])
    except Exception:
        return None
    return None
def _cache_put_sqlite(
    con: Optional[sqlite3.Connection],
    cache_key: str,
    model: str,
    data: Dict[str, Any],
) -> None:
    """Best-effort upsert of an LLM result into the sqlite cache.

    Serialization and SQL errors are swallowed deliberately — a failed
    cache write must never break the extraction pipeline.
    """
    if con is None:
        return
    try:
        con.execute(
            "INSERT OR REPLACE INTO llm_cache(cache_key, model, result_json) VALUES (?,?,?)",
            (cache_key, model, json.dumps(data, ensure_ascii=False)),
        )
    except Exception:
        return

659
extract/parse.py Normal file
View File

@@ -0,0 +1,659 @@
from __future__ import annotations
import json
import re
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
from tg_resume_db.normalize import normalize_skill
from tg_resume_db.extract.experience import extract_experience
EMAIL_RE = re.compile(r"\b[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}\b", re.I)
# Re-joins an email whose local part was split by a stray space during PDF
# text extraction ("john doe@mail.com" -> prefix + tail).
EMAIL_SPLIT_RE = re.compile(
    r"(?<![@\w])(?P<prefix>[a-z0-9][a-z0-9._%+\-]{1,40})\s+"
    r"(?P<tail>[a-z0-9][a-z0-9._%+\-]{0,40}@[a-z0-9.\-]+\.[a-z]{2,})",
    re.I,
)
PHONE_RE = re.compile(r"(?<!\w)(\+?\d[\d\-\s().]{7,}\d)(?!\w)")
# Telegram handle via t.me/ link or a bare @username.
TG_RE = re.compile(r"(?:t\.me/|@)([a-z0-9_]{4,32})", re.I)
GITHUB_RE = re.compile(r"github\.com/([A-Za-z0-9\-]+)", re.I)
LINKEDIN_RE = re.compile(r"linkedin\.com/in/([A-Za-z0-9\-_]+)", re.I)
URL_RE = re.compile(r"\bhttps?://[^\s)]+", re.I)
# CEFR English levels like "B2", "C1+".
EN_RE = re.compile(r"\b(A1\+?|A2\+?|B1\+?|B2\+?|C1\+?|C2\+?)\b", re.I)
# Verbal English-level descriptions.
EN_TEXT_RE = re.compile(
    r"\b(native|fluent|proficient|advanced|upper\s*intermediate|intermediate|elementary)\b",
    re.I,
)
EN_LANG_RE = re.compile(r"\b(english|англий)\b", re.I)
# Remote-work willingness markers (EN + RU).
REMOTE_RE = re.compile(
    r"\b("
    r"full[\s\-]?remote|remote[\s\-]?(work|position|job|only)|open to remote|remote first|"
    r"удален\w*|удалён\w*|дистанцион\w*|home office|relocation not needed"
    r")\b",
    re.I,
)
# Salary (rough)
CURRENCY_RE = re.compile(r"(?:\b(?:руб|rub|usd|eur)\b|[₽$€])", re.I)
# Amounts: "120k"/"120к", "120 000", or a plain 4-7 digit number.
NUM_RE = re.compile(r"(?:(\d{2,3})\s*(k|к)\b)|(\d{2,3}\s*000)|(\d{4,7})", re.I)
# Words that mark a line as salary-related (EN + RU).
SALARY_HINT_RE = re.compile(
    r"\b("
    r"salary|compensation|rate|expected salary|desired salary|salary expectation|income|"
    r"зарплат\w*|доход|оклад|вознагражден\w*|заработ\w*|expectations"
    r")\b",
    re.I,
)
# A digit within 14 chars of a currency token (either order).
PAY_TOKEN_RE = re.compile(
    r"([€$₽]|\b(?:usd|eur|rub|руб)\b).{0,14}\d|\d.{0,14}([€$₽]|\b(?:usd|eur|rub|руб)\b)",
    re.I,
)
# Numbers near these words are achievement metrics, not salaries.
SALARY_NOISE_RE = re.compile(
    r"\b(users?|employees?|people|domains?|cities?|objects?|stores?|requests?|transactions?|"
    r"companies?|followers?|downloads?|clients?)\b",
    re.I,
)
# A line that is exactly a resume section header.
SECTION_HEADER_RE = re.compile(
    r"^\s*(contacts?|contact info|about|summary|skills?|experience|work experience|education|languages?|projects?)\s*$",
    re.I,
)
# One-line "City, Country" location (Latin or Cyrillic).
LOCATION_CITY_COUNTRY_RE = re.compile(
    r"^[A-Za-zА-Яа-я][A-Za-zА-Яа-я' .\-]{1,40},\s*[A-Za-zА-Яа-я][A-Za-zА-Яа-я' .\-]{1,40}$"
)
# --- SKILLS & ROLES ---
# Canonical skill vocabulary; matched case-insensitively with boundary
# guards (see _build_skill_patterns).
SKILLS = {
    "python","go","golang","java","kotlin","c#","c++","cpp","javascript","typescript","node","nodejs","react","vue","angular",
    "sql","postgres","postgresql","mysql","mssql","redis","kafka","rabbitmq","docker","k8s","kubernetes","helm","terraform",
    "aws","gcp","azure","linux","nginx","grpc","rest","graphql","spark","airflow","fastapi","django","flask","spring","dotnet",
    "pytest","selenium","playwright","ci/cd","gitlab","github actions","prometheus","grafana"
}
# Alternate spellings that normalize to a canonical skill above.
_SKILL_ALIASES: Dict[str, List[str]] = {
    "javascript": ["java script", "java-script", "js"],
    "typescript": ["type script", "type-script", "ts"],
    "postgresql": ["postgres", "postgre sql", "postgre-sql"],
    "graphql": ["graph ql"],
    "grpc": ["g rpc"],
}
def _build_skill_patterns() -> List[Tuple[str, re.Pattern]]:
    """Compile one boundary-safe, case-insensitive pattern per skill alias.

    Each entry is (canonical_skill, compiled_pattern); lookarounds stop the
    pattern from firing inside larger identifiers ("golang" vs "go").
    """
    left, right = r"(?<![a-z0-9+#])", r"(?![a-z0-9+#])"
    compiled: List[Tuple[str, re.Pattern]] = []
    for skill in sorted(SKILLS):
        for alias in [skill, *_SKILL_ALIASES.get(skill, [])]:
            if skill == "java" and alias == "java":
                # Do not match "java" inside "java script" (a javascript alias).
                rx = re.compile(left + r"java(?!\s*script)" + right, re.I)
            else:
                rx = re.compile(left + re.escape(alias) + right, re.I)
            compiled.append((skill, rx))
    return compiled
# Compiled once at import time; reused by extract_roles_skills.
_SKILL_PATTERNS = _build_skill_patterns()
# Canonical role vocabulary.
ROLES = {
    "backend","frontend","fullstack","devops","qa","sre","data engineer","data scientist","ml engineer",
    "mobile","android","ios","team lead","tech lead","architect"
}
# Alternate titles (English and Russian) folded into a canonical ROLES key.
_ROLE_ALIASES: Dict[str, List[str]] = {
    "backend": ["backend", "backend developer", "backend engineer", "бэкенд", "бекенд"],
    "frontend": ["frontend", "frontend developer", "frontend engineer", "фронтенд", "фронт"],
    "fullstack": ["fullstack", "full stack", "full-stack", "фулстек"],
    "devops": ["devops", "dev ops", "platform engineer", "infrastructure engineer"],
    "qa": ["qa", "quality assurance", "tester", "test engineer", "test automation", "manual qa"],
    "sre": ["sre", "site reliability"],
    "data engineer": ["data engineer"],
    "data scientist": ["data scientist"],
    "ml engineer": ["ml engineer", "machine learning engineer"],
    "mobile": ["mobile developer", "mobile engineer"],
    "android": ["android developer", "android engineer"],
    "ios": ["ios developer", "ios engineer"],
    "team lead": ["team lead", "teamlead"],
    "tech lead": ["tech lead", "techlead"],
    "architect": ["architect", "solution architect", "software architect"],
}
def _build_role_patterns() -> Dict[str, List[re.Pattern]]:
    """Compile a boundary-safe, case-insensitive pattern for every role alias."""
    def _compile(alias: str) -> re.Pattern:
        return re.compile(r"(?<![a-z0-9+#])" + re.escape(alias) + r"(?![a-z0-9+#])", re.I)

    return {
        role: [_compile(alias) for alias in _ROLE_ALIASES.get(role, [role])]
        for role in ROLES
    }
# Compiled once at import time; reused by extract_roles_skills.
_ROLE_PATTERNS = _build_role_patterns()
# --- HR / RECRUITER FILTERS ---
# Words that indicate the line is about searching for candidates, not owning the skill.
HR_CONTEXT_RE = re.compile(
    r"\b(hiring|recruitment|recruiter|sourc(ing|er)|talent|acquisition|vacancy|vacancies|candidate|staffing|headhunt)\b|"
    r"\b(подбор|поиск|найм|закры(ла|л|тие)|ваканси|резюме|сорс(инг|ер)|рекрут|персонал|кадр(ы|ов)|hr)\b",
    re.I
)
# Roles that explicitly define the person as non-engineering (HR/recruiting titles).
NON_TECH_ROLES_RE = re.compile(
    r"\b(recruiter|hr|talent|manager|generalist|human resources|head of recruitment|рекрутер|менеджер по персоналу|эйчар)\b",
    re.I
)
# --- EXPERIENCE ---
# Biographical lines ("35 years old", gender, birth date) — excluded so an age
# is never mistaken for years of experience.
AGE_LINE_RE = re.compile(
    r"(?i)\b(мужчина|женщина|родил[а-я]*|возраст|years?\s+old)\b"
)
# "Work experience" / "Опыт работы" header marker.
EXP_HEADER_RE = re.compile(
    r"(?i)\b(опыт\s+работы|стаж(\s+работы)?|work\s+experience|experience)\b"
)
# "5 years 10 months" directly attached to an experience header.
EXP_SUMMARY_RE = re.compile(
    r"(?i)\b(опыт\s+работы|стаж(\s+работы)?|work\s+experience|experience)\b"
    r"[^0-9]{0,20}"
    r"(?P<y>\d{1,2})\s*(?:год|года|лет|years?|yrs?)"
    r"(?:[^0-9]{0,20}(?P<m>\d{1,2})\s*(?:мес|месяц|месяца|месяцев|months?))?"
)
# Looser "N years [M months]" anywhere near an experience header.
EXP_NEARBY_RE = re.compile(
    r"(?i)\b(?P<y>\d{1,2})\s*(?:год|года|лет|years?|yrs?)"
    r"(?:[^0-9]{0,20}(?P<m>\d{1,2})\s*(?:мес|месяц|месяца|месяцев|months?))?"
)
# HH.ru export footer: "Name • резюме обновлено ..." — reliable name source.
HH_FOOTER_RE = re.compile(
    r"(?P<name>[A-Za-zА-ЯЁ][A-Za-zА-Яа-яЁё'\-\s]{2,80})\s*[•|]\s*резюме\s+обновлено",
    re.I,
)
# Explicit "Name: ..." / "Имя: ..." key-value line.
NAME_KV_RE = re.compile(r"^\s*(name|имя)\s*[:\-]\s*(.+)$", re.I)
# 2-4 capitalized words — a bare "First Last [Middle]" line.
NAME_LINE_RE = re.compile(
    r"^[A-ZА-ЯЁ][A-Za-zА-Яа-яЁё'\-]+(?:\s+[A-ZА-ЯЁ][A-Za-zА-Яа-яЁё'\-]+){1,3}$"
)
# Lines that look name-shaped but are section titles / job titles / institutions.
NAME_STOPWORDS = {
    "resume", "cv", "contacts", "contact", "summary", "skills", "experience", "education",
    "projects", "about", "profile", "objective", "навыки", "опыт", "образование",
    "контакты", "профиль", "цель", "резюме",
    "developer", "engineer", "backend", "frontend", "fullstack", "team lead", "tech lead",
    "backend developer", "frontend developer", "fullstack developer", "software engineer",
    "разработчик", "инженер", "бэкенд", "фронтенд", "фулстек", "тимлид", "техлид",
    "top skills", "experience", "education", "languages", "certifications",
    "skills & endorsements", "endorsements",
    "university", "state university", "institute", "college", "academy", "school",
    "bachelor", "master", "degree", "faculty", "университет", "институт", "академия",
    "колледж", "школа", "бакалавр", "магистр", "факультет",
}
# Exact-match section words used by _looks_like_heading_line.
_NAME_BAD_WORDS = {
    "skills", "top skills", "experience", "education", "languages", "certifications",
    "projects", "summary", "about", "profile", "endorsements",
    "university", "institute", "college", "academy", "school",
    "bachelor", "master", "degree", "faculty",
}
# Education/institution vocabulary — lines (or their context) matching this are
# not person names (e.g. "Lomonosov Moscow State University").
NAME_INSTITUTION_RE = re.compile(
    r"\b("
    r"university|institute|college|academy|school|faculty|bachelor|master|degree|"
    r"mathematics|computer science|informatics|physics|economics|management|"
    r"университет|институт|академ|колледж|школа|факультет|бакалав|магистр|"
    r"математик|информатик|физик|экономик|менеджмент"
    r")\b",
    re.I,
)
# Label words that must not be glued onto a split e-mail local part.
_EMAIL_PREFIX_STOP = {
    "email", "mail", "contact", "contacts", "phone", "tel", "telegram", "linkedin", "github",
}
def _prune_fragment_emails(values: List[str]) -> List[str]:
uniq = sorted(set(v.lower().strip() for v in values if v and "@" in v))
out: List[str] = []
for e in uniq:
local, domain = e.split("@", 1)
drop = False
for other in uniq:
if other == e:
continue
ol, od = other.split("@", 1)
if od != domain:
continue
if len(local) <= 8 and len(ol) > len(local) + 2 and ol.endswith(local) and re.search(r"[._\-]", ol):
drop = True
break
if not drop:
out.append(e)
return out
def extract_experience_years(text: str) -> Tuple[Optional[float], Optional[float], float, Dict[str, Any]]:
    """
    Returns (total_years, engineering_years, confidence, debug).

    Logic:
      1. Calculate TOTAL experience: explicit summary ("опыт работы 5 лет 10 месяцев"),
         then a looser number near an experience header, then date-range timelines.
      2. Check if the candidate is primarily a Recruiter/HR (header lines).
         - If YES: engineering_years = 0.0 (prevents recruiters from showing up as Senior Devs).
         - If NO: engineering_years = total_years (optimistic assumption for valid devs).
    """
    dbg: Dict[str, Any] = {"method": None, "matched": None, "is_recruiter": False}
    total_years: Optional[float] = None
    confidence = 0.0
    lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]

    # 1. Detect if Recruiter
    # Check the "header" (first ~15 lines) for HR titles.
    header_text = "\n".join(lines[:15])
    is_recruiter = bool(NON_TECH_ROLES_RE.search(header_text))
    dbg["is_recruiter"] = is_recruiter

    # 2. Extract Total Duration
    if lines:
        # Strategy A: explicit summary on/right after an experience header.
        for i, ln in enumerate(lines[:200]):
            if AGE_LINE_RE.search(ln):
                continue  # "35 years old" must not be read as experience
            if not EXP_HEADER_RE.search(ln):
                continue
            window = ln
            if i + 1 < len(lines):
                window += " " + lines[i + 1]
            if i + 2 < len(lines):
                window += " " + lines[i + 2]
            m = EXP_SUMMARY_RE.search(window)
            if m:
                y = int(m.group("y"))
                mm = int(m.group("m")) if m.group("m") else 0
                val = float(round(y + (mm / 12.0), 2))
                # BUGFIX: validate BEFORE assigning. Previously an implausible
                # match (> 60 years) was stored in total_years with zero
                # confidence, which also blocked Strategy B and the timeline
                # fallback below.
                if 0 <= val <= 60:
                    total_years = val
                    dbg["method"] = "summary"
                    dbg["matched"] = m.group(0)
                    confidence = 0.95
                    break
        # Strategy B: any nearby "N years [M months]" in the chunk after a header.
        if total_years is None:
            safe_lines = [ln for ln in lines if not AGE_LINE_RE.search(ln)]
            for i, ln in enumerate(safe_lines):
                if not EXP_HEADER_RE.search(ln):
                    continue
                chunk = " ".join(safe_lines[i : i + 12])
                m = EXP_NEARBY_RE.search(chunk)
                if m:
                    y = int(m.group("y"))
                    mm = int(m.group("m")) if m.group("m") else 0
                    val = float(round(y + (mm / 12.0), 2))
                    if 0 <= val <= 60:
                        total_years = val
                        dbg["method"] = "header_chunk"
                        dbg["matched"] = m.group(0)
                        confidence = 0.80
                        break

    # 2.5 Timeline/range fallback-reconciliation
    # Protects against cases where the summary parser catches one short fragment
    # while the CV has a long date-range timeline.
    try:
        alt = extract_experience(text or "")
    except Exception:
        # extract_experience is best-effort; any failure just disables the fallback.
        alt = None
    if alt and alt.years is not None:
        if total_years is None:
            total_years = alt.years
            confidence = max(confidence, alt.confidence)
            dbg["method"] = "timeline_fallback"
            dbg["matched"] = "date_ranges"
        elif alt.years > (total_years + 1.0):
            strong_summary = str(dbg.get("method") or "") in ("summary", "header_chunk") and confidence >= 0.78
            if strong_summary and (alt.years - float(total_years)) > 1.5:
                # Trust a confident explicit summary over a noticeably longer timeline.
                dbg["reconcile"] = "timeline_skip_strong_summary"
            else:
                total_years = alt.years
                confidence = max(confidence, min(0.82, alt.confidence))
                dbg["method"] = "timeline_reconcile"
                dbg["matched"] = "date_ranges"

    # 3. Calculate Engineering Years
    eng_years = total_years
    if is_recruiter:
        # If they are a recruiter, their "engineering" experience is effectively 0
        # for the purpose of finding a Developer.
        eng_years = 0.0
    return total_years, eng_years, confidence, dbg
def _norm_phone(p: str) -> str:
digits = re.sub(r"\D+", "", p)
if digits.startswith("8") and len(digits) == 11:
digits = "7" + digits[1:]
return "+" + digits if digits else ""
def _norm_token(s: str) -> str:
return re.sub(r"\s+", " ", s.strip().lower())
def safe_json(v) -> str:
    """JSON-encode *v*, keeping non-ASCII characters (e.g. Cyrillic) readable."""
    encoded: str = json.dumps(v, ensure_ascii=False)
    return encoded
def extract_contacts(text: str) -> Dict[str, List[str]]:
    """Extract contact channels from resume text.

    Returns a dict with keys "emails", "phones", "telegram", "github",
    "linkedin", "urls"; every list is deduplicated and sorted.  Besides plain
    EMAIL_RE hits, addresses that extraction split as "<prefix> <tail>@<domain>"
    are reassembled via EMAIL_SPLIT_RE, then truncated fragments are pruned.
    """
    emails_set = set(m.group(0).lower() for m in EMAIL_RE.finditer(text or ""))
    for m in EMAIL_SPLIT_RE.finditer(text or ""):
        # Candidate prefix that got separated from the local part (line wrap).
        prefix = m.group("prefix").strip().lower().strip(".-_")
        if not prefix or prefix in _EMAIL_PREFIX_STOP:
            continue
        # Require a separator or digit so ordinary words are not glued on.
        if not re.search(r"[._\-\d]", prefix):
            continue
        tail = m.group("tail").lower()
        if "@" not in tail:
            continue
        local_tail, domain = tail.split("@", 1)
        local = f"{prefix}{local_tail}"
        if len(local) > 64:
            # 64 chars is the conventional local-part length cap.
            continue
        cand = f"{local}@{domain}"
        if EMAIL_RE.fullmatch(cand):
            emails_set.add(cand)
    emails = _prune_fragment_emails(sorted(emails_set))
    # Phones are normalized to "+<digits>"; empty normalizations are dropped.
    phones = sorted(set(_norm_phone(m.group(1)) for m in PHONE_RE.finditer(text) if _norm_phone(m.group(1))))
    tg = sorted(set(m.group(1).lower() for m in TG_RE.finditer(text)))
    gh = sorted(set(m.group(1).lower() for m in GITHUB_RE.finditer(text)))
    li = sorted(set(m.group(1).lower() for m in LINKEDIN_RE.finditer(text)))
    urls = sorted(set(m.group(0) for m in URL_RE.finditer(text)))
    return {"emails": emails, "phones": phones, "telegram": tg, "github": gh, "linkedin": li, "urls": urls}
def extract_name_guess(text: str) -> Optional[str]:
    """Best-effort candidate-name detection.

    Tries, in order of reliability: the HH.ru footer, an explicit "Name:" line,
    a name-shaped line near the top, then one near the end.  Returns None when
    nothing name-like is found.
    """
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    if not lines:
        return None
    # 1) HH footer "Name • Резюме обновлено ..."
    m = HH_FOOTER_RE.search(text or "")
    if m:
        cand = m.group("name").strip()
        if _looks_like_name_line(cand):
            return cand
    # 2) Key-value line: "Name: ..." / "Имя: ..."
    for ln in lines[:40]:
        m2 = NAME_KV_RE.match(ln)
        if m2:
            cand = m2.group(2).strip()
            # Drop trailing role/contact info after a separator.
            cand = re.split(r"[|,/;]", cand)[0].strip()
            if _looks_like_name_line(cand):
                return cand
    # 3) Name-like in first ~40 lines
    for ln in lines[:40]:
        if _looks_like_heading_line(ln):
            continue
        if _looks_like_name_line(ln):
            return ln
    # 4) Name-like near the end (pptx exports often put name there)
    tail_start = max(0, len(lines) - 60)
    for i in range(tail_start, len(lines)):
        ln = lines[i]
        if _looks_like_heading_line(ln):
            continue
        # Skip names embedded in education context (university / faculty lines).
        ctx = " ".join(lines[max(0, i - 2) : min(len(lines), i + 3)]).lower()
        if NAME_INSTITUTION_RE.search(ctx):
            continue
        if _looks_like_name_line(ln):
            return ln
    return None
def _looks_like_heading_line(line: str) -> bool:
    """True when *line* reads like a resume section heading rather than a name."""
    lowered = (line or "").strip().lower()
    if not lowered:
        return False
    if lowered in _NAME_BAD_WORDS or lowered.startswith("top skills"):
        return True
    is_short = len(lowered.split()) <= 3
    mentions_section = any(
        marker in lowered
        for marker in ("skills", "experience", "education", "languages")
    )
    return is_short and mentions_section
def _looks_like_name_line(line: str) -> bool:
    """Heuristic: does *line* look like a bare person name ("First Last")?"""
    if not line or len(line) > 80:
        return False
    lowered = line.lower().strip()
    if lowered in NAME_STOPWORDS:
        return False
    if _looks_like_heading_line(line):
        return False
    # "Resume"/"CV" titles and institution lines are never names.
    if re.search(r"\b(resume|cv|резюме)\b", line, re.I):
        return False
    if NAME_INSTITUTION_RE.search(line):
        return False
    return bool(NAME_LINE_RE.match(line.strip()))
def extract_remote(text: str) -> Optional[bool]:
    """True when a remote-work marker appears in the first 120 lines.

    Returns None (unknown) otherwise — absence of a marker is not evidence
    against remote work, so False is never returned.
    """
    if not text:
        return None
    head = text.splitlines()[:120]
    return True if any(REMOTE_RE.search(line) for line in head) else None
def extract_english(text: str) -> Optional[str]:
    """Map English-language mentions to a rough CEFR level, or None.

    Codes matched by EN_RE (defined earlier in this module — presumably
    CEFR-style A1..C2 tokens; verify against its definition) are trusted
    anywhere in the text.  Verbal levels ("fluent", "intermediate", ...)
    count only on, or right after, a line that mentions English, so levels
    of other languages are not picked up.
    """
    t = text or ""
    lines = [ln.strip() for ln in t.splitlines() if ln.strip()]
    # 1) CEFR levels anywhere are accepted.
    m = EN_RE.search(t)
    if m:
        return m.group(1).replace("+", "").upper()  # e.g. "b2+" -> "B2"
    # 2) Textual levels only when English context is present.
    candidate_chunks: List[str] = []
    for i, ln in enumerate(lines):
        if EN_LANG_RE.search(ln):
            candidate_chunks.append(ln)
            # The level often sits on the following line ("English" / "Fluent").
            if i + 1 < len(lines):
                candidate_chunks.append(lines[i + 1])
    if not candidate_chunks:
        return None
    m2 = EN_TEXT_RE.search("\n".join(candidate_chunks))
    if not m2:
        return None
    # Rough verbal -> CEFR mapping.
    word = m2.group(1).lower()
    if word in ("native", "fluent", "proficient", "advanced"):
        return "C1"
    if word.startswith("upper"):
        return "B2"
    if word == "intermediate":
        return "B1"
    if word == "elementary":
        return "A2"
    return None
def extract_roles_skills(text: str) -> Tuple[List[str], List[str]]:
    """
    Extracts roles and skills, but strictly filters out HR/Recruitment context.

    Returns (sorted roles, sorted skills).  Lines that talk about hiring or
    vacancies are removed first, so a recruiter's "looking for Python devs"
    does not credit them with Python.
    """
    lines = text.splitlines()
    # 1. Filter text: Remove lines that talk about hiring/vacancies
    clean_lines = []
    for ln in lines:
        if not HR_CONTEXT_RE.search(ln):
            clean_lines.append(ln)
    clean_text = "\n".join(clean_lines).lower()
    # 2. Extract Skills from clean text only
    skills = []
    for s, pat in _SKILL_PATTERNS:
        if pat.search(clean_text):
            skills.append(normalize_skill(s) or s)
    skills = sorted(set(skills))
    # 3. Extract Roles
    # Priority: Header (first 10 lines)
    header_text = "\n".join(lines[:10]).lower()
    found_roles = set()
    # Check if Recruiter
    if NON_TECH_ROLES_RE.search(header_text):
        # If explicit recruiter in header, do NOT add generic tech roles like "backend"
        # even if they appear in the text (often describes who they hire).
        pass
    else:
        # Normal extraction
        for r in ROLES:
            pats = _ROLE_PATTERNS.get(r, [])
            if any(p.search(clean_text) for p in pats):
                # extra guard: devops requires explicit evidence, not just CI/CD mentions
                if r == "devops":
                    if not re.search(r"\b(devops|dev ops|sre|platform engineer|infrastructure)\b", clean_text, re.I):
                        continue
                found_roles.add(r)
    return sorted(list(found_roles)), skills
def norm_pipe(tokens: List[str]) -> str:
    """Serialize *tokens* as a pipe-delimited set: "|a|b|" ("|" when empty).

    Tokens are normalized via _norm_token, deduplicated and sorted; empty
    normalizations are discarded.
    """
    normalized = {_norm_token(tok) for tok in tokens}
    normalized.discard("")
    if not normalized:
        return "|"
    return "|" + "|".join(sorted(normalized)) + "|"
def extract_salary(text: str) -> Tuple[Optional[int], Optional[int], float, Dict]:
    """Best-effort salary extraction.

    Returns (salary_min, salary_max, confidence, debug).  Only lines with a
    salary hint word or an inline currency+number token are scanned; numbers
    are kept in the 20k..30M range; confidence is scored from hint/currency
    evidence and results below 0.45 are discarded.
    """
    dbg: Dict[str, Any] = {"numbers": [], "currency_hits": 0, "hint_lines": 0, "used_lines": []}
    lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]
    if not lines:
        return None, None, 0.0, dbg
    # Candidate lines: (index, line, has_hint_word, has_inline_pay_token).
    candidates: List[Tuple[int, str, bool, bool]] = []
    for i, ln in enumerate(lines):
        has_hint = SALARY_HINT_RE.search(ln) is not None
        has_pay = PAY_TOKEN_RE.search(ln) is not None
        if not has_hint and not has_pay:
            continue
        # "2000 users"-style count nouns disqualify a line unless it also hints salary.
        if SALARY_NOISE_RE.search(ln) and not has_hint:
            continue
        candidates.append((i, ln, has_hint, has_pay))
    if not candidates:
        return None, None, 0.0, dbg
    has_hint = any(x[2] for x in candidates)
    if not has_hint:
        # Inline pay without "salary" is allowed only near header/contact block.
        candidates = [x for x in candidates if x[0] < 15]
        if not candidates:
            return None, None, 0.0, dbg
    scan_chunks: List[str] = []
    for i, ln, hint, _ in candidates:
        chunk = ln
        # A hint line's number often sits on the next line — include it.
        if hint and (i + 1) < len(lines):
            chunk = f"{chunk} {lines[i + 1]}"
        scan_chunks.append(chunk)
        dbg["used_lines"].append(ln)
        if hint:
            dbg["hint_lines"] += 1
        dbg["currency_hits"] += len(CURRENCY_RE.findall(chunk))
    nums: List[int] = []
    for chunk in scan_chunks:
        for m in NUM_RE.finditer(chunk):
            val = None
            if m.group(1) and m.group(2):
                # "120k" / "120к" shorthand.
                val = int(m.group(1)) * 1000
            elif m.group(3):
                # "120 000" with thousands spacing.
                val = int(re.sub(r"\s+", "", m.group(3)))
            elif m.group(4):
                # Bare 4-7 digit figure.
                val = int(m.group(4))
            # Plausibility window: 20k .. 30M (covers RUB monthly figures).
            if val and 20_000 <= val <= 30_000_000:
                nums.append(val)
                dbg["numbers"].append(val)
    if not nums:
        return None, None, 0.0, dbg
    nums = sorted(nums)
    salary_min = nums[0]
    salary_max = nums[-1] if len(nums) > 1 else nums[0]
    # Confidence ladder: hint word beats inline currency; penalties for a
    # suspiciously wide range or a single number.
    if dbg["hint_lines"] > 0:
        conf = 0.82 if dbg["currency_hits"] > 0 else 0.70
    else:
        conf = 0.58 if dbg["currency_hits"] > 0 else 0.0
    if salary_max > salary_min * 4:
        conf -= 0.12
    if len(nums) == 1:
        conf -= 0.06
    conf = max(0.0, min(conf, 0.9))
    if conf < 0.45:
        return None, None, conf, dbg
    return salary_min, salary_max, conf, dbg
def extract_location_best_effort(text: str) -> Optional[str]:
    """Best-effort location extraction.

    First tries labelled lines ("Location: ...", "Город: ..."), then scans the
    pre-section head of the document for a "City, Country"-shaped segment.
    Returns a cleaned location string or None.
    """
    if not text:
        return None
    def _clean_loc(val: str) -> str:
        # Collapse whitespace and strip list/separator punctuation.
        return re.sub(r"\s+", " ", (val or "").strip(" |,;"))
    def _is_loc_like(val: str, *, allow_single: bool = False) -> bool:
        # Reject paths/emails, long digit runs and section headers; accept
        # "City, Country", and a single word only when explicitly labelled.
        v = _clean_loc(val)
        if not v or len(v) < 3 or len(v) > 90:
            return False
        if re.search(r"[@/\\]", v) or re.search(r"\d{3,}", v):
            return False
        if SECTION_HEADER_RE.match(v):
            return False
        if LOCATION_CITY_COUNTRY_RE.match(v):
            return True
        if allow_single and re.fullmatch(r"[A-Za-zА-Яа-я][A-Za-zА-Яа-я' .\-]{1,40}", v):
            return True
        return False
    # Labelled location lines (en/ru).
    patterns = [
        re.compile(r"(?i)\b(location|город|city)\s*:\s*(.{2,40})"),
        re.compile(r"(?i)\b(место)\s*:\s*(.{2,40})"),
        re.compile(r"(?i)\b(проживает|проживание)\s*:\s*(.{2,60})"),
    ]
    for p in patterns:
        m = p.search(text)
        if m:
            val = _clean_loc(m.group(2))
            if _is_loc_like(val, allow_single=True):
                return val
    # Fallback: scan the document head up to the first section header
    # (the contacts header itself is allowed through).
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    head: List[str] = []
    for ln in lines[:60]:
        if SECTION_HEADER_RE.match(ln):
            low = ln.lower()
            if low in ("contacts", "contact", "contact info"):
                continue
            break
        head.append(ln)
    for ln in head:
        # Try the whole line and each "|"-separated segment.
        parts = [ln] + [seg.strip() for seg in ln.split("|") if seg.strip()]
        for seg in parts:
            if _is_loc_like(seg):
                return _clean_loc(seg)
    return None

211
extract/pdf_extract.py Normal file
View File

@@ -0,0 +1,211 @@
from __future__ import annotations
import re
import shutil
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple
try: # optional dependency
from pypdf import PdfReader # type: ignore
except Exception: # pragma: no cover
try:
from PyPDF2 import PdfReader # type: ignore
except Exception: # pragma: no cover
PdfReader = None # type: ignore
try: # optional dependency
from pdfminer.high_level import extract_text as pdfminer_extract_text # type: ignore
except Exception: # pragma: no cover
pdfminer_extract_text = None # type: ignore
@dataclass
class PdfExtractResult:
    """Outcome of trying several PDF text extractors on one file."""
    # Best extracted text (after degluing).
    text: str
    # Per-page dicts {"page": n, "text": ...} from pypdf (may be empty).
    pages: List[dict]
    # Winning extractor: "pdftotext_layout", "pdftotext_plain", "pypdf",
    # "pdfminer" or "none".
    method: str
    # Heuristic quality score from _quality_score (higher is better).
    score: float
    # Quality flags such as "low_alpha", "glued_text", "scan_like".
    flags: List[str]
# Resume section titles (en/ru); their presence raises the extraction
# quality score in _quality_score.
_SECTION_HINTS = [
    "experience", "work experience", "skills", "education", "projects", "summary", "about",
    "опыт работы", "навыки", "образование", "проекты", "о себе",
]
def _which_pdftotext() -> Optional[str]:
exe = shutil.which("pdftotext") or shutil.which("pdftotext.exe")
return exe
def _run_pdftotext(path: Path, *, layout: bool, timeout_sec: int = 25) -> str:
    """Run pdftotext on *path* and return its stripped stdout.

    Returns '' when the binary is missing or the subprocess fails/times out.
    ``layout=True`` preserves the page's visual column layout.
    """
    exe = _which_pdftotext()
    if not exe:
        return ""
    args = [exe]
    if layout:
        args.append("-layout")
    args.extend(["-nopgbrk", str(path), "-"])  # "-" writes text to stdout
    try:
        proc = subprocess.run(
            args,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout_sec,
            check=False,
            text=True,
            encoding="utf-8",
            errors="ignore",
        )
        return (proc.stdout or "").strip()
    except Exception:
        # Timeout or OS-level failure — treat as "no text extracted".
        return ""
def _extract_pages_pypdf(path: Path, max_pages: int = 60) -> List[dict]:
    """Per-page text via pypdf/PyPDF2.

    Returns a list of {"page": 1-based number, "text": str}; empty when the
    library is unavailable or the file cannot be opened.  Capped at
    *max_pages* pages.
    """
    if PdfReader is None:
        return []
    try:
        reader = PdfReader(str(path), strict=False)
    except Exception:
        return []
    pages: List[dict] = []
    for i, page in enumerate(getattr(reader, "pages", [])):
        if max_pages and i >= max_pages:
            break
        try:
            text = page.extract_text() or ""
        except Exception:
            # A single broken page must not kill the whole document.
            text = ""
        pages.append({"page": i + 1, "text": text})
    return pages
def _extract_pdfminer(path: Path) -> str:
    """Whole-document text via pdfminer; '' when unavailable or on any error."""
    if pdfminer_extract_text is None:
        return ""
    try:
        return (pdfminer_extract_text(str(path)) or "").strip()
    except Exception:
        return ""
def _quality_score(text: str) -> Tuple[float, List[str]]:
    """Heuristically score extracted text quality.

    Returns (score, flags): score in roughly 0..7, higher is better; flags
    name the failed checks ("low_alpha", "glued_text", "scan_like", ...).
    """
    flags: List[str] = []
    if not text:
        return 0.0, ["empty"]

    # Character-level metrics.
    total_chars = len(text)
    alpha_ratio = sum(ch.isalpha() for ch in text) / max(1, total_chars)
    space_ratio = text.count(" ") / max(1, total_chars)

    # Word-level metric.
    words = re.findall(r"[A-Za-zА-Яа-я0-9]+", text)
    avg_word_len = (sum(len(w) for w in words) / max(1, len(words))) if words else 0.0

    # Line-level metric: very long lines suggest lost line breaks.
    nonempty = [ln for ln in text.splitlines() if ln.strip()]
    overlong = [ln for ln in nonempty if len(ln) > 200]
    long_line_ratio = (len(overlong) / max(1, len(nonempty))) if nonempty else 0.0

    # "fooBar"/"word123" boundaries suggest glued-together tokens.
    glued_hits = len(re.findall(r"[a-zа-я][A-ZА-Я]|[A-Za-zА-Яа-я][0-9]|[0-9][A-Za-zА-Яа-я]", text))
    section_hits = sum(1 for s in _SECTION_HINTS if s in text.lower())

    score = 0.0
    if alpha_ratio >= 0.45:
        score += 2.0
    elif alpha_ratio >= 0.30:
        score += 1.0
    else:
        flags.append("low_alpha")
    if 0.10 <= space_ratio <= 0.28:
        score += 1.0
    else:
        flags.append("odd_spacing")
    if 3.5 <= avg_word_len <= 9.0:
        score += 1.0
    else:
        flags.append("odd_word_len")
    if long_line_ratio <= 0.06:
        score += 1.0
    else:
        flags.append("long_lines")
    if glued_hits <= 6:
        score += 1.0
    else:
        flags.append("glued_text")
    if section_hits >= 2:
        score += 1.0
    elif section_hits == 1:
        score += 0.5
    if total_chars < 200:
        flags.append("short_text")
    # Almost no letters or nearly empty output usually means a scanned PDF.
    if alpha_ratio < 0.08 or total_chars < 120:
        flags.append("scan_like")
    return score, flags
def deglue_text(text: str) -> str:
    """Insert missing spaces at case/digit boundaries left by bad PDF extraction.

    Applied in order: lower→Upper, letter→digit, digit→letter, and
    ':'/';' glued to a following word.
    """
    if not text:
        return text
    rules = (
        (r"([a-zа-я])([A-ZА-Я])", r"\1 \2"),     # "fooBar"   -> "foo Bar"
        (r"([A-Za-zА-Яа-я])([0-9])", r"\1 \2"),  # "python3"  -> "python 3"
        (r"([0-9])([A-Za-zА-Яа-я])", r"\1 \2"),  # "3years"   -> "3 years"
        (r"([:;])([A-Za-zА-Яа-я])", r"\1 \2"),   # "skills:x" -> "skills: x"
    )
    result = text
    for pattern, repl in rules:
        result = re.sub(pattern, repl, result)
    return result
def extract_pdf_best(path: Path, timeout_sec: int = 25) -> PdfExtractResult:
    """Run every available extractor and keep the highest-scoring text.

    Tries pdftotext (layout and plain modes), pypdf and pdfminer; each
    candidate is scored by _quality_score and the best text is degluing-fixed
    before returning.  Per-page texts always come from pypdf regardless of
    which method won.
    """
    candidates: List[Tuple[str, str]] = []
    txt_layout = _run_pdftotext(path, layout=True, timeout_sec=timeout_sec)
    if txt_layout:
        candidates.append(("pdftotext_layout", txt_layout))
    txt_plain = _run_pdftotext(path, layout=False, timeout_sec=timeout_sec)
    if txt_plain:
        candidates.append(("pdftotext_plain", txt_plain))
    txt_pypdf = ""
    if PdfReader is not None:
        pages = _extract_pages_pypdf(path)
        if pages:
            txt_pypdf = "\n\n".join(p.get("text", "") for p in pages if p.get("text"))
    if txt_pypdf:
        candidates.append(("pypdf", txt_pypdf))
    txt_pdfminer = _extract_pdfminer(path)
    if txt_pdfminer:
        candidates.append(("pdfminer", txt_pdfminer))
    if not candidates:
        return PdfExtractResult(text="", pages=[], method="none", score=0.0, flags=["empty"])
    # Pick the candidate with the highest quality score (first wins ties).
    best_method = "none"
    best_text = ""
    best_score = -1.0
    best_flags: List[str] = []
    for method, text in candidates:
        score, flags = _quality_score(text)
        if score > best_score:
            best_score = score
            best_method = method
            best_text = text
            best_flags = flags
    pages = _extract_pages_pypdf(path)
    best_text = deglue_text(best_text)
    return PdfExtractResult(text=best_text, pages=pages, method=best_method, score=best_score, flags=best_flags)

70
extract/sections.py Normal file
View File

@@ -0,0 +1,70 @@
from __future__ import annotations
import re
from typing import Dict, List, Optional, Tuple
# Canonical section key -> header-line patterns (en/ru).  A resume line that
# fully matches one of these switches split_sections into that section.
# NOTE(review): "курсы"/"сертификаты" appear under both "education" and
# "certifications"; dict order means "education" wins — confirm intended.
_SECTION_PATTERNS: dict[str, List[re.Pattern]] = {
    "contacts": [
        re.compile(r"^\s*(contacts?|contact info|контакты)\s*$", re.I),
    ],
    "about": [
        re.compile(r"^\s*(summary|about|profile|objective|о\s+себе|обо\s+мне|профиль|цель)\s*$", re.I),
    ],
    "skills": [
        re.compile(r"^\s*(skills?|key skills|stack|tech( stack)?|навыки|технологии|компетенции)\s*$", re.I),
    ],
    "experience": [
        re.compile(r"^\s*(experience|work experience|employment|опыт\s+работы|опыт)\s*$", re.I),
    ],
    "education": [
        re.compile(r"^\s*(education|образование|курсы|certifications?|сертификаты)\s*$", re.I),
    ],
    "projects": [
        re.compile(r"^\s*(projects?|проекты)\s*$", re.I),
    ],
    "languages": [
        re.compile(r"^\s*(languages?|языки)\s*$", re.I),
    ],
    "certifications": [
        re.compile(r"^\s*(certifications?|сертификаты|курсы)\s*$", re.I),
    ],
    "publications": [
        re.compile(r"^\s*(publications?|публикации)\s*$", re.I),
    ],
}
def _match_header(line: str) -> Optional[str]:
    """Return the canonical section key whose header pattern matches *line*.

    Patterns are tried in _SECTION_PATTERNS insertion order; None when no
    pattern matches.
    """
    return next(
        (
            key
            for key, patterns in _SECTION_PATTERNS.items()
            for rx in patterns
            if rx.match(line)
        ),
        None,
    )
def split_sections(clean_text: str, doc_type: str | None = None) -> Dict[str, str]:
    """Split cleaned resume text into named sections.

    Lines before the first recognized header collect under "header"; a line
    matching a section pattern switches the current bucket.  Empty sections
    are dropped from the result.  *doc_type* is currently unused.
    """
    buckets: Dict[str, List[str]] = {"header": []}
    current = "header"
    for raw in (clean_text or "").splitlines():
        line = raw.strip()
        if not line:
            continue
        matched = _match_header(line)
        if matched is not None:
            current = matched
            buckets.setdefault(current, [])
        else:
            buckets.setdefault(current, []).append(line)
    result: Dict[str, str] = {}
    for name, collected in buckets.items():
        body = "\n".join(collected).strip()
        if body:
            result[name] = body
    return result
def sections_present(sections: Dict[str, str]) -> List[str]:
    """Sorted names of non-empty sections, excluding the implicit "header"."""
    found = {name for name, body in (sections or {}).items() if body and name != "header"}
    return sorted(found)

View File

@@ -0,0 +1 @@
__all__ = []

View File

@@ -0,0 +1,46 @@
from __future__ import annotations
from typing import Any, Dict
from tg_resume_db.extract.parse import (
extract_contacts,
extract_name_guess,
extract_remote,
extract_english,
extract_roles_skills,
extract_salary,
extract_location_best_effort,
extract_experience_years,
)
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
    """Generic heuristic parser: run every extractor over the full cleaned text.

    *sections* is accepted for interface parity with template parsers but is
    not used here.
    """
    text = clean_text or ""
    exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(text)
    sal_min, sal_max, sal_conf, sal_dbg = extract_salary(text)
    roles, skills = extract_roles_skills(text)
    parsed: Dict[str, Any] = {
        "name": extract_name_guess(text),
        "contacts_raw": extract_contacts(text),
        "remote": extract_remote(text),
        "english": extract_english(text),
        "roles": roles,
        "skills": skills,
        "location": extract_location_best_effort(text),
        "exp_years": exp_years,
        "exp_years_eng": exp_years_eng,
        "exp_conf": exp_conf,
        "exp_dbg": exp_dbg,
        "salary_min": sal_min,
        "salary_max": sal_max,
        "salary_conf": sal_conf,
        "salary_dbg": sal_dbg,
        "parse_method": "generic_heur",
    }
    return parsed

58
extract/templates/hh.py Normal file
View File

@@ -0,0 +1,58 @@
from __future__ import annotations
from typing import Any, Dict
from tg_resume_db.extract.parse import (
extract_contacts,
extract_name_guess,
extract_remote,
extract_english,
extract_roles_skills,
extract_salary,
extract_location_best_effort,
extract_experience_years,
)
def _pick(sections: Dict[str, str] | None, key: str, fallback: str) -> str:
if not sections:
return fallback
return sections.get(key) or fallback
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
    """HH.ru template parser: scope extractors to sections where available."""
    header_text = _pick(sections, "header", clean_text)
    contacts_text = _pick(sections, "contacts", clean_text)
    about_text = _pick(sections, "about", clean_text)
    skills_text = _pick(sections, "skills", clean_text)
    exp_text = _pick(sections, "experience", clean_text)
    # Experience heuristics see the summary plus the work-history section.
    exp_scope = "\n".join([about_text, exp_text]).strip() or exp_text
    exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(exp_scope)
    sal_min, sal_max, sal_conf, sal_dbg = extract_salary(clean_text)
    roles, skills = extract_roles_skills("\n".join([about_text, skills_text, exp_text]))
    parsed: Dict[str, Any] = {
        "name": extract_name_guess(header_text),
        "contacts_raw": extract_contacts(contacts_text),
        "remote": extract_remote(clean_text),
        "english": extract_english(clean_text),
        "roles": roles,
        "skills": skills,
        "location": extract_location_best_effort(clean_text),
        "exp_years": exp_years,
        "exp_years_eng": exp_years_eng,
        "exp_conf": exp_conf,
        "exp_dbg": exp_dbg,
        "salary_min": sal_min,
        "salary_max": sal_max,
        "salary_conf": sal_conf,
        "salary_dbg": sal_dbg,
        "parse_method": "hh_template",
    }
    return parsed

View File

@@ -0,0 +1,85 @@
from __future__ import annotations
import re
from typing import Any, Dict, Optional
from tg_resume_db.extract.parse import (
extract_contacts,
extract_name_guess,
extract_remote,
extract_english,
extract_roles_skills,
extract_salary,
extract_location_best_effort,
extract_experience_years,
)
# HH.ru labelled fields: "Желаемая должность", "Специализация",
# "График работы", "Занятость" — each captures the value after the label.
_DESIRED_RE = re.compile(r"(?i)жел[а-я]*\s+должност[ьи]\s*[:\-]?\s*(.+)")
_SPEC_RE = re.compile(r"(?i)специализаци[яи]\s*[:\-]?\s*(.+)")
_SCHEDULE_RE = re.compile(r"(?i)график\s+работы\s*[:\-]?\s*(.+)")
_EMPLOYMENT_RE = re.compile(r"(?i)занятость\s*[:\-]?\s*(.+)")
def _pick(sections: Dict[str, str] | None, key: str, fallback: str) -> str:
if not sections:
return fallback
return sections.get(key) or fallback
def _find_first(regex: re.Pattern, text: str) -> Optional[str]:
for ln in text.splitlines():
m = regex.search(ln)
if m:
val = m.group(1).strip()
val = re.split(r"[|;/]", val)[0].strip()
if 2 <= len(val) <= 80:
return val
return None
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
    """HH.ru template parser with extra labelled fields (desired title, schedule, ...)."""
    header_text = _pick(sections, "header", clean_text)
    contacts_text = _pick(sections, "contacts", clean_text)
    about_text = _pick(sections, "about", clean_text)
    skills_text = _pick(sections, "skills", clean_text)
    exp_text = _pick(sections, "experience", clean_text)
    # Experience heuristics see the summary plus the work-history section.
    exp_scope = "\n".join([about_text, exp_text]).strip() or exp_text
    exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(exp_scope)
    sal_min, sal_max, sal_conf, sal_dbg = extract_salary(clean_text)
    roles, skills = extract_roles_skills("\n".join([about_text, skills_text, exp_text]))
    parsed: Dict[str, Any] = {
        "name": extract_name_guess(header_text),
        "contacts_raw": extract_contacts(contacts_text),
        "remote": extract_remote(clean_text),
        "english": extract_english(clean_text),
        "roles": roles,
        "skills": skills,
        "location": extract_location_best_effort(clean_text),
        "exp_years": exp_years,
        "exp_years_eng": exp_years_eng,
        "exp_conf": exp_conf,
        "exp_dbg": exp_dbg,
        "salary_min": sal_min,
        "salary_max": sal_max,
        "salary_conf": sal_conf,
        "salary_dbg": sal_dbg,
        # HH-specific labelled fields (None when the label is absent).
        "desired_title": _find_first(_DESIRED_RE, clean_text),
        "specializations": _find_first(_SPEC_RE, clean_text),
        "employment_type": _find_first(_EMPLOYMENT_RE, clean_text),
        "schedule": _find_first(_SCHEDULE_RE, clean_text),
        "parse_method": "hh_template",
    }
    return parsed

View File

@@ -0,0 +1,57 @@
from __future__ import annotations
from typing import Any, Dict
from tg_resume_db.extract.parse import (
extract_contacts,
extract_name_guess,
extract_remote,
extract_english,
extract_roles_skills,
extract_salary,
extract_location_best_effort,
extract_experience_years,
)
def _pick(sections: Dict[str, str] | None, key: str, fallback: str) -> str:
if not sections:
return fallback
return sections.get(key) or fallback
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
    """LinkedIn-export template parser.

    Contacts are scanned over the whole text (LinkedIn exports have no
    dedicated contacts section); other extractors are section-scoped.
    """
    header_text = _pick(sections, "header", clean_text)
    about_text = _pick(sections, "about", clean_text)
    skills_text = _pick(sections, "skills", clean_text)
    exp_text = _pick(sections, "experience", clean_text)
    exp_scope = "\n".join([about_text, exp_text]).strip() or exp_text
    exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(exp_scope)
    sal_min, sal_max, sal_conf, sal_dbg = extract_salary(clean_text)
    roles, skills = extract_roles_skills("\n".join([about_text, skills_text, exp_text]))
    parsed: Dict[str, Any] = {
        "name": extract_name_guess(header_text),
        "contacts_raw": extract_contacts(clean_text),
        "remote": extract_remote(clean_text),
        "english": extract_english(clean_text),
        "roles": roles,
        "skills": skills,
        "location": extract_location_best_effort(clean_text),
        "exp_years": exp_years,
        "exp_years_eng": exp_years_eng,
        "exp_conf": exp_conf,
        "exp_dbg": exp_dbg,
        "salary_min": sal_min,
        "salary_max": sal_max,
        "salary_conf": sal_conf,
        "salary_dbg": sal_dbg,
        "parse_method": "linkedin_template",
    }
    return parsed

View File

@@ -0,0 +1,46 @@
from __future__ import annotations
from typing import Any, Dict
from tg_resume_db.extract.parse import (
extract_contacts,
extract_name_guess,
extract_remote,
extract_english,
extract_roles_skills,
extract_salary,
extract_location_best_effort,
extract_experience_years,
)
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
    """Section-agnostic one-page parser: every extractor scans the whole text.

    *sections* is accepted for signature parity with the other templates
    but is not consulted here.
    """
    text = clean_text or ""

    contacts_raw = extract_contacts(text)
    name = extract_name_guess(text)
    roles, skills = extract_roles_skills(text)
    remote = extract_remote(text)
    english = extract_english(text)
    location = extract_location_best_effort(text)
    exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(text)
    sal_min, sal_max, sal_conf, sal_dbg = extract_salary(text)

    return dict(
        name=name,
        contacts_raw=contacts_raw,
        remote=remote,
        english=english,
        roles=roles,
        skills=skills,
        location=location,
        exp_years=exp_years,
        exp_years_eng=exp_years_eng,
        exp_conf=exp_conf,
        exp_dbg=exp_dbg,
        salary_min=sal_min,
        salary_max=sal_max,
        salary_conf=sal_conf,
        salary_dbg=sal_dbg,
        parse_method="one_page_template",
    )

View File

@@ -0,0 +1,11 @@
from __future__ import annotations
from typing import Any, Dict
from tg_resume_db.extract.templates.one_page import parse_resume as _parse
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
    """Delegate to the generic one-page parser, re-tagging the result as the
    English variant."""
    result = _parse(clean_text, sections)
    result["parse_method"] = "one_page_en"
    return result

View File

@@ -0,0 +1,11 @@
from __future__ import annotations
from typing import Any, Dict
from tg_resume_db.extract.templates.one_page import parse_resume as _parse
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
    """Delegate to the generic one-page parser, re-tagging the result as the
    Russian variant."""
    result = _parse(clean_text, sections)
    result["parse_method"] = "one_page_ru"
    return result

View File

@@ -0,0 +1,45 @@
from __future__ import annotations
from typing import Any, Dict
from tg_resume_db.extract.parse import (
extract_contacts,
extract_name_guess,
extract_remote,
extract_english,
extract_roles_skills,
extract_salary,
extract_location_best_effort,
extract_experience_years,
)
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
    """PPTX-template parser: all extractors run over the full cleaned text.

    *sections* is accepted for signature parity with the other templates
    but is not consulted here.
    """
    text = clean_text or ""

    contacts_raw = extract_contacts(text)
    name = extract_name_guess(text)
    roles, skills = extract_roles_skills(text)
    remote = extract_remote(text)
    english = extract_english(text)
    location = extract_location_best_effort(text)
    exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(text)
    sal_min, sal_max, sal_conf, sal_dbg = extract_salary(text)

    return dict(
        name=name,
        contacts_raw=contacts_raw,
        remote=remote,
        english=english,
        roles=roles,
        skills=skills,
        location=location,
        exp_years=exp_years,
        exp_years_eng=exp_years_eng,
        exp_conf=exp_conf,
        exp_dbg=exp_dbg,
        salary_min=sal_min,
        salary_max=sal_max,
        salary_conf=sal_conf,
        salary_dbg=sal_dbg,
        parse_method="pptx_template",
    )

99
extract/text_extract.py Normal file
View File

@@ -0,0 +1,99 @@
from __future__ import annotations
import os
from pathlib import Path
import logging
from bs4 import BeautifulSoup
# Resolve an optional PdfReader implementation at import time: prefer the
# modern "pypdf" package, fall back to the legacy "PyPDF2" name, and leave
# _PdfReader as None when neither is installed (extract_text_from_pdf
# raises RuntimeError in that case).
try: # optional dependency for PDF fallback
    from pypdf import PdfReader as _PdfReader # type: ignore
except Exception: # pragma: no cover - optional import
    try:
        from PyPDF2 import PdfReader as _PdfReader # type: ignore
    except Exception: # pragma: no cover
        _PdfReader = None # type: ignore
def _read_bytes(path: Path) -> bytes:
    """Read the file at *path* fully into memory as raw bytes."""
    with path.open("rb") as fh:
        return fh.read()
def extract_text_from_txt(path: Path) -> str:
    """Decode a text file, trying a few common encodings in order.

    Each candidate is tried with strict decoding; the first one that
    decodes the whole file wins.  The original used errors="ignore"
    inside the loop, so the first candidate (utf-8) always "succeeded"
    and the utf-16/cp1251/latin-1 fallbacks were dead code — cp1251
    (Russian) files came back with every Cyrillic byte silently dropped.
    latin-1 accepts any byte sequence and so acts as the terminal
    fallback; the final lossy utf-8 decode is kept as a safety net.
    """
    data = path.read_bytes()
    for enc in ("utf-8", "utf-16", "cp1251", "latin-1"):
        try:
            return data.decode(enc)  # strict: fall through on decode error
        except UnicodeError:
            continue
    return data.decode("utf-8", errors="ignore")
def extract_text_from_html(path: Path) -> str:
    """Extract visible text from an HTML file, one line per text node.

    Prefers the fast "lxml" parser but falls back to the stdlib
    "html.parser" backend when lxml is not installed (BeautifulSoup
    raises FeatureNotFound in that case), so the extractor no longer has
    a hard dependency on an optional package.
    """
    html = extract_text_from_txt(path)
    try:
        soup = BeautifulSoup(html, "lxml")
    except Exception:  # lxml missing -> bs4 FeatureNotFound
        soup = BeautifulSoup(html, "html.parser")
    return soup.get_text("\n", strip=True)
def extract_text_from_docx(path: Path) -> str:
    """Pull plain text from a .docx: non-empty paragraphs, then table rows
    rendered as " | "-joined cell texts, all newline-separated."""
    from docx import Document

    document = Document(str(path))
    chunks = []
    for paragraph in document.paragraphs:
        stripped = (paragraph.text or "").strip()
        if stripped:
            chunks.append(stripped)
    for tbl in document.tables:
        for row in tbl.rows:
            cells = [c.text.strip() for c in row.cells if c.text and c.text.strip()]
            if cells:
                chunks.append(" | ".join(cells))
    return "\n".join(chunks)
# Hard cap on pages read per PDF; override with the PDF_PAGE_LIMIT env var
# (0 disables the cap — see the truthiness check in extract_text_from_pdf).
_PDF_PAGE_LIMIT = int(os.environ.get("PDF_PAGE_LIMIT", "40"))
# Silence noisy pypdf warnings like "Ignoring wrong pointing object ..."
logging.getLogger("pypdf").setLevel(logging.ERROR)
logging.getLogger("PyPDF2").setLevel(logging.ERROR)
def extract_text_from_pdf(path: Path) -> str:
    """Lightweight PDF text extractor built on the optional PyPDF readers.

    Reads at most _PDF_PAGE_LIMIT pages (env PDF_PAGE_LIMIT, default 40;
    0 means unlimited) and skips pages whose extraction fails.

    Raises:
        RuntimeError: when no PdfReader backend is installed or the file
            cannot be opened as a PDF.
    """
    if _PdfReader is None:
        raise RuntimeError("PDF reader dependency missing (install pypdf or PyPDF2)")
    try:
        reader = _PdfReader(str(path), strict=False)
    except Exception as exc:  # pragma: no cover - pdf parser edge cases
        raise RuntimeError(f"PDF read failed: {exc}") from exc

    chunks = []
    for page_no, page in enumerate(getattr(reader, "pages", [])):
        if _PDF_PAGE_LIMIT and page_no >= _PDF_PAGE_LIMIT:
            break
        try:
            page_text = page.extract_text()  # type: ignore[attr-defined]
        except Exception:
            page_text = None  # unreadable page: skip, keep the rest
        if page_text:
            chunks.append(page_text)
    return "\n".join(chunks)
def extract_text_from_doc_best_effort(path: Path) -> str:
    """Best-effort text for legacy .doc files via the optional ``textract``
    package; returns "" when the package is missing or extraction fails."""
    try:
        import textract  # type: ignore
    except Exception:
        return ""
    try:
        raw = textract.process(str(path))
    except Exception:
        return ""
    return raw.decode("utf-8", errors="ignore")
def extract_text(path: Path) -> str:
    """Dispatch to the per-format extractor based on the file suffix
    (case-insensitive); unsupported or missing suffixes yield ""."""
    suffix = path.suffix.lower()
    if suffix in {".txt", ".log"}:
        return extract_text_from_txt(path)
    if suffix in {".html", ".htm"}:
        return extract_text_from_html(path)
    if suffix == ".docx":
        return extract_text_from_docx(path)
    if suffix == ".pdf":
        return extract_text_from_pdf(path)
    if suffix == ".doc":
        return extract_text_from_doc_best_effort(path)
    return ""