Initial commit

This commit is contained in:
2026-03-11 15:27:10 +03:00
commit 8b4b8d54d1
34 changed files with 7407 additions and 0 deletions

70
extract/sections.py Normal file
View File

@@ -0,0 +1,70 @@
from __future__ import annotations
import re
from typing import Dict, List, Optional, Tuple
# Header-detection table: canonical section key -> regexes that match a line
# consisting solely of a section title (English or Russian, case-insensitive,
# surrounding whitespace allowed).
#
# Lookup walks this dict in insertion order and stops at the first key with a
# matching regex, so every title must be listed under exactly ONE key; a title
# duplicated under a later key makes that later key unreachable.
_SECTION_PATTERNS: dict[str, List[re.Pattern]] = {
    "contacts": [
        re.compile(r"^\s*(contacts?|contact info|контакты)\s*$", re.I),
    ],
    "about": [
        re.compile(r"^\s*(summary|about|profile|objective|о\s+себе|обо\s+мне|профиль|цель)\s*$", re.I),
    ],
    "skills": [
        re.compile(r"^\s*(skills?|key skills|stack|tech( stack)?|навыки|технологии|компетенции)\s*$", re.I),
    ],
    "experience": [
        re.compile(r"^\s*(experience|work experience|employment|опыт\s+работы|опыт)\s*$", re.I),
    ],
    "education": [
        # Certificate/course titles are intentionally NOT listed here: they
        # belong to "certifications" and listing them under this earlier key
        # would shadow that section entirely.
        re.compile(r"^\s*(education|образование)\s*$", re.I),
    ],
    "projects": [
        re.compile(r"^\s*(projects?|проекты)\s*$", re.I),
    ],
    "languages": [
        re.compile(r"^\s*(languages?|языки)\s*$", re.I),
    ],
    "certifications": [
        re.compile(r"^\s*(certifications?|сертификаты|курсы)\s*$", re.I),
    ],
    "publications": [
        re.compile(r"^\s*(publications?|публикации)\s*$", re.I),
    ],
}
def _match_header(line: str) -> Optional[str]:
    """Return the section key whose header pattern matches *line*.

    Keys of ``_SECTION_PATTERNS`` are tried in dict order; the first key that
    has any regex matching the line wins. Returns ``None`` when no pattern
    matches, i.e. the line is not a recognised section title.
    """
    return next(
        (
            name
            for name, regexes in _SECTION_PATTERNS.items()
            if any(rx.match(line) for rx in regexes)
        ),
        None,
    )
def split_sections(clean_text: str, doc_type: str | None = None) -> Dict[str, str]:
lines = [ln.strip() for ln in (clean_text or "").splitlines()]
sections: Dict[str, List[str]] = {"header": []}
current = "header"
for ln in lines:
if not ln:
continue
key = _match_header(ln)
if key:
current = key
sections.setdefault(current, [])
continue
sections.setdefault(current, []).append(ln)
out: Dict[str, str] = {}
for k, vals in sections.items():
text = "\n".join(vals).strip()
if text:
out[k] = text
return out
def sections_present(sections: Dict[str, str]) -> List[str]:
    """Return the sorted names of non-empty sections, excluding ``"header"``.

    Args:
        sections: Mapping of section key to text; ``None`` or an empty
            mapping yields an empty list.

    Returns:
        Alphabetically sorted keys whose value is truthy, with the
        ``"header"`` key left out.
    """
    if not sections:
        return []
    names = [name for name, body in sections.items() if name != "header" and body]
    names.sort()
    return names