# tg_resume_db/extract/sections.py

from __future__ import annotations

import re
from typing import Dict, List, Optional, Tuple


# Header-line patterns for each section key.
#
# Every pattern is anchored to the whole line (``^\s*...\s*$``) and is
# case-insensitive, so only a line consisting solely of the heading matches.
#
# Header lookup walks this dict in insertion order and stops at the first key
# that has a matching pattern, so each heading must be listed under exactly
# one key. The certificate/course headings are deliberately absent from
# "education": listing them there as well would shadow the "certifications"
# entry further down and make it unreachable.
_SECTION_PATTERNS: dict[str, List[re.Pattern]] = {
    "contacts": [
        re.compile(r"^\s*(contacts?|contact info|контакты)\s*$", re.I),
    ],
    "about": [
        re.compile(r"^\s*(summary|about|profile|objective|о\s+себе|обо\s+мне|профиль|цель)\s*$", re.I),
    ],
    "skills": [
        re.compile(r"^\s*(skills?|key skills|stack|tech( stack)?|навыки|технологии|компетенции)\s*$", re.I),
    ],
    "experience": [
        re.compile(r"^\s*(experience|work experience|employment|опыт\s+работы|опыт)\s*$", re.I),
    ],
    "education": [
        re.compile(r"^\s*(education|образование)\s*$", re.I),
    ],
    "projects": [
        re.compile(r"^\s*(projects?|проекты)\s*$", re.I),
    ],
    "languages": [
        re.compile(r"^\s*(languages?|языки)\s*$", re.I),
    ],
    "certifications": [
        re.compile(r"^\s*(certifications?|сертификаты|курсы)\s*$", re.I),
    ],
    "publications": [
        re.compile(r"^\s*(publications?|публикации)\s*$", re.I),
    ],
}


def _match_header(line: str) -> Optional[str]:
    """Return the section key whose header pattern matches ``line``.

    Section keys are tried in the insertion order of ``_SECTION_PATTERNS``;
    the first key with at least one matching pattern wins. ``None`` is
    returned when no pattern matches.
    """
    candidates = (
        section
        for section, regexes in _SECTION_PATTERNS.items()
        if any(rx.match(line) for rx in regexes)
    )
    return next(candidates, None)


def split_sections(clean_text: str, doc_type: str | None = None) -> Dict[str, str]:
    """Split text into named sections delimited by recognised header lines.

    Every line is stripped and blank lines are dropped. A line recognised by
    ``_match_header`` switches the current section and is itself discarded;
    any other line is appended to the current section. Lines that appear
    before the first header are collected under ``"header"``. When the same
    header occurs more than once, its lines accumulate in one section.

    Args:
        clean_text: Text to split. A falsy value (``None`` or an empty
            string) produces an empty result.
        doc_type: Accepted for interface compatibility; not used here.

    Returns:
        Mapping from section key to that section's lines joined with
        newlines, in order of first appearance. Sections that end up with
        no text are left out.
    """
    buckets: Dict[str, List[str]] = {"header": []}
    active = "header"

    for raw in (clean_text or "").splitlines():
        stripped = raw.strip()
        if not stripped:
            continue

        matched = _match_header(stripped)
        if matched:
            # Header lines only switch the target bucket; they are not kept.
            active = matched
            if active not in buckets:
                buckets[active] = []
        else:
            buckets[active].append(stripped)

    joined = ((name, "\n".join(chunk).strip()) for name, chunk in buckets.items())
    return {name: text for name, text in joined if text}


def sections_present(sections: Dict[str, str]) -> List[str]:
    """Return the sorted keys of all non-empty sections except ``"header"``.

    A falsy ``sections`` (``None`` or an empty dict) yields an empty list.
    """
    if not sections:
        return []
    names = [name for name, body in sections.items() if body and name != "header"]
    names.sort()
    return names