Initial commit
This commit is contained in:
70
extract/sections.py
Normal file
70
extract/sections.py
Normal file
@@ -0,0 +1,70 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
|
||||
# Header-recognition table: canonical section key -> compiled patterns that a
# whole line must match to be treated as that section's heading. English and
# Russian variants are listed side by side; matching is case-insensitive and
# tolerates surrounding whitespace.
#
# _match_header() walks this dict in insertion order and returns the first key
# with a matching pattern, so every heading must appear under exactly one key.
# Certification/course headings are listed only under "certifications":
# repeating them under "education" (which comes first) would shadow that entry
# and make the "certifications" key unreachable.
_SECTION_PATTERNS: Dict[str, List[re.Pattern]] = {
    "contacts": [
        re.compile(r"^\s*(contacts?|contact info|контакты)\s*$", re.I),
    ],
    "about": [
        re.compile(r"^\s*(summary|about|profile|objective|о\s+себе|обо\s+мне|профиль|цель)\s*$", re.I),
    ],
    "skills": [
        re.compile(r"^\s*(skills?|key skills|stack|tech( stack)?|навыки|технологии|компетенции)\s*$", re.I),
    ],
    "experience": [
        re.compile(r"^\s*(experience|work experience|employment|опыт\s+работы|опыт)\s*$", re.I),
    ],
    "education": [
        re.compile(r"^\s*(education|образование)\s*$", re.I),
    ],
    "projects": [
        re.compile(r"^\s*(projects?|проекты)\s*$", re.I),
    ],
    "languages": [
        re.compile(r"^\s*(languages?|языки)\s*$", re.I),
    ],
    "certifications": [
        re.compile(r"^\s*(certifications?|сертификаты|курсы)\s*$", re.I),
    ],
    "publications": [
        re.compile(r"^\s*(publications?|публикации)\s*$", re.I),
    ],
}
|
||||
|
||||
|
||||
def _match_header(line: str) -> Optional[str]:
    """Return the section key whose heading pattern matches *line*, else None.

    The keys of ``_SECTION_PATTERNS`` are tried in insertion order and the
    first key owning a matching pattern wins, so earlier entries take
    precedence whenever more than one could match.
    """
    return next(
        (
            section
            for section, candidates in _SECTION_PATTERNS.items()
            if any(pattern.match(line) for pattern in candidates)
        ),
        None,
    )
|
||||
|
||||
|
||||
def split_sections(clean_text: str, doc_type: str | None = None) -> Dict[str, str]:
    """Split *clean_text* into named sections keyed by recognised headings.

    Each line is stripped and blank lines are discarded. A line recognised by
    ``_match_header`` switches the active section and is not itself stored;
    every other line is appended to the active section. Lines seen before the
    first heading are collected under ``"header"``. A heading that occurs more
    than once keeps accumulating into the same section.

    Args:
        clean_text: Text to split; ``None`` or ``""`` yields an empty result.
        doc_type: Accepted for interface compatibility; not used here.

    Returns:
        Mapping of section key to its lines joined with newlines. Sections
        that end up with no text are omitted.
    """
    buckets: Dict[str, List[str]] = {"header": []}
    active = "header"

    for raw_line in (clean_text or "").splitlines():
        content = raw_line.strip()
        if not content:
            continue

        heading = _match_header(content)
        if heading:
            # Heading lines only switch the target bucket; they are not kept.
            active = heading
            if active not in buckets:
                buckets[active] = []
        else:
            buckets[active].append(content)

    joined = ((name, "\n".join(body).strip()) for name, body in buckets.items())
    return {name: text for name, text in joined if text}
|
||||
|
||||
|
||||
def sections_present(sections: Dict[str, str]) -> List[str]:
    """Return the sorted keys of all non-empty sections except ``"header"``.

    Args:
        sections: Mapping of section key to section text; a falsy value
            (``None`` or an empty dict) is treated as having no sections.

    Returns:
        Alphabetically sorted list of keys whose text is truthy, with the
        ``"header"`` key always left out.
    """
    # sorted() accepts any iterable, so no intermediate list is needed.
    return sorted(
        name
        for name, text in (sections or {}).items()
        if text and name != "header"
    )
|
||||
Reference in New Issue
Block a user