Files
tg_resume_db/extract/sections.py
2026-03-11 15:27:10 +03:00

71 lines
2.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from __future__ import annotations
import re
from typing import Dict, List, Optional, Tuple
# Maps a canonical section key to the regexes that recognise its heading line.
# Every pattern is anchored to the whole line (optional surrounding whitespace)
# and is case-insensitive; both English and Russian headings are listed.
#
# Header lookup returns the FIRST key (in insertion order) whose pattern
# matches, so patterns for different keys must not overlap. Previously the
# "education" pattern also listed the certification headings, which made the
# "certifications" entry unreachable. The two are now disjoint: "курсы"
# (courses) stays with "education", as it resolved before, while
# "certifications?" / "сертификаты" resolve to "certifications".
_SECTION_PATTERNS: dict[str, List[re.Pattern]] = {
    "contacts": [
        re.compile(r"^\s*(contacts?|contact info|контакты)\s*$", re.I),
    ],
    "about": [
        re.compile(r"^\s*(summary|about|profile|objective|о\s+себе|обо\s+мне|профиль|цель)\s*$", re.I),
    ],
    "skills": [
        re.compile(r"^\s*(skills?|key skills|stack|tech( stack)?|навыки|технологии|компетенции)\s*$", re.I),
    ],
    "experience": [
        re.compile(r"^\s*(experience|work experience|employment|опыт\s+работы|опыт)\s*$", re.I),
    ],
    "education": [
        re.compile(r"^\s*(education|образование|курсы)\s*$", re.I),
    ],
    "projects": [
        re.compile(r"^\s*(projects?|проекты)\s*$", re.I),
    ],
    "languages": [
        re.compile(r"^\s*(languages?|языки)\s*$", re.I),
    ],
    "certifications": [
        re.compile(r"^\s*(certifications?|сертификаты)\s*$", re.I),
    ],
    "publications": [
        re.compile(r"^\s*(publications?|публикации)\s*$", re.I),
    ],
}
def _match_header(line: str) -> Optional[str]:
    """Return the section key whose heading pattern matches *line*.

    Keys of ``_SECTION_PATTERNS`` are tried in insertion order and the first
    key with at least one matching pattern wins. Returns ``None`` when no
    pattern matches.
    """
    return next(
        (
            section
            for section, regexes in _SECTION_PATTERNS.items()
            if any(rx.match(line) for rx in regexes)
        ),
        None,
    )
def split_sections(clean_text: str, doc_type: str | None = None) -> Dict[str, str]:
    """Split text into named sections keyed by the headings it contains.

    Each line is stripped and blank lines are dropped. A line recognised by
    ``_match_header`` switches the active section and is not itself stored;
    any other line is appended to the active section. Lines seen before the
    first heading go under ``"header"``. A heading that occurs more than once
    keeps accumulating into the same section.

    Args:
        clean_text: Text to split; ``None`` or empty yields an empty dict.
        doc_type: Accepted for interface compatibility; not used here.

    Returns:
        Mapping of section key to its lines joined with ``"\\n"``. Sections
        that end up with no text are omitted.
    """
    buckets: Dict[str, List[str]] = {"header": []}
    active = "header"

    for raw in (clean_text or "").splitlines():
        stripped = raw.strip()
        if not stripped:
            continue

        heading = _match_header(stripped)
        if heading:
            # Heading line: switch section, making sure its bucket exists.
            active = heading
            if active not in buckets:
                buckets[active] = []
        else:
            buckets.setdefault(active, []).append(stripped)

    joined = ((name, "\n".join(chunk).strip()) for name, chunk in buckets.items())
    return {name: body for name, body in joined if body}
def sections_present(sections: Dict[str, str]) -> List[str]:
    """Return the sorted keys of all non-empty sections except ``"header"``.

    A ``None`` or empty mapping yields an empty list.
    """
    if not sections:
        return []
    names = [name for name, body in sections.items() if name != "header" and body]
    names.sort()
    return names