"""Heuristic extraction of work-history positions from plain-text resumes.

The parser looks for lines that contain a date range ("Jan 2020 - Mar 2021",
"03.2018 — по н.в.", "2015 to 2019"), treats the following line as a
"title, company" header and collects the lines after it as the description
until the next date range. English and Russian month names are recognised.
"""

from __future__ import annotations

import re
from dataclasses import asdict, dataclass
from datetime import date
from typing import List, Optional, Tuple

# Month-name stems (English and Russian) mapped to month numbers.  Lookup is
# by prefix (see ``_parse_mon``), so Russian stems cover inflected forms
# ("январ" -> "января", "январь").
MONTHS = {
    "jan": 1, "january": 1, "янв": 1, "январ": 1,
    "feb": 2, "february": 2, "фев": 2, "феврал": 2,
    "mar": 3, "march": 3, "мар": 3, "март": 3,
    "apr": 4, "april": 4, "апр": 4, "апрел": 4,
    # "мая" is the genitive used in dates; it does not start with "май".
    "may": 5, "май": 5, "мая": 5,
    "jun": 6, "june": 6, "июн": 6, "июнь": 6,
    "jul": 7, "july": 7, "июл": 7, "июль": 7,
    "aug": 8, "august": 8, "авг": 8, "август": 8,
    "sep": 9, "september": 9, "сен": 9, "сент": 9,
    "oct": 10, "october": 10, "окт": 10, "октя": 10,
    "nov": 11, "november": 11, "ноя": 11, "ноябр": 11,
    "dec": 12, "december": 12, "дек": 12, "дека": 12,
}

# "Still employed" markers.  The closing guard is ``(?!\w)`` rather than
# ``\b`` because ``\b`` can never match after the final "." of "н.в." when
# it is followed by a space or the end of the text.
PRESENT_RE = re.compile(
    r"\b(present|now|current|настоящее время|(?:по )?н\.в\.|по настоящее)(?!\w)",
    re.I,
)
MMYYYY_RE = re.compile(r"\b(0?[1-9]|1[0-2])[./-](\d{4})\b")
YYYY_RE = re.compile(r"\b(19\d{2}|20\d{2})\b")
MON_YYYY_RE = re.compile(r"\b([A-Za-z]{3,9}|[А-Яа-я]{3,9})\.?\s*(\d{4})\b")

# A range separator is a dash (spaces optional) or the word "to"/"по"
# surrounded by whitespace, so the letters "to" inside "October" do not count.
_RANGE_SEP = r"(?:\s*[—–-]\s*|\s+(?:to|по)\s+)"
_RANGE_SEP_RE = re.compile(_RANGE_SEP, re.I)
RANGE_RE = re.compile(r"(?P<a>.+?)" + _RANGE_SEP + r"(?P<b>.+?)$", re.I)
YEAR_RANGE_ONLY_RE = re.compile(r"^\s*\d{4}\s*(?:—|–|-|to|по)\s*\d{4}\s*$", re.I)

# Education vocabulary.  English terms match as whole words; Russian terms are
# stems matched as prefixes so inflected forms ("бакалавр", "университета")
# are recognised.  "магистр" is kept from matching "магистраль" (a pipeline
# or highway, not a degree).
EDU_CONTEXT_RE = re.compile(
    r"\b("
    r"(?:education|university|institute|college|academy|school|bachelor|master|degree|faculty)\b"
    r"|(?:образовани|университет|институт|академ|колледж|школ|бакалав|магистр(?!ал)|факультет)\w*"
    r")",
    re.I,
)


@dataclass
class Position:
    """One extracted work-history entry.

    Attributes:
        title: First comma/pipe/slash-separated part of the header line.
        company: Second part of the header line, if any.
        date_from: ISO date (first day of the month) the range starts at.
        date_to: ISO date (first day of the month) the range ends at; for a
            current position this is the first day of the current month.
        is_current: True when the end of the range is a "present" marker.
        description: Lines following the header joined with newlines, or
            None when there are none.
    """

    title: Optional[str]
    company: Optional[str]
    date_from: Optional[str]
    date_to: Optional[str]
    is_current: Optional[bool]
    description: Optional[str]


def _parse_mon(mon: str) -> Optional[int]:
    """Return the month number for a month word, or None if unrecognised."""
    token = re.sub(r"\W+", "", mon.strip().lower())
    if not token:
        return None
    for stem, number in MONTHS.items():
        if token.startswith(stem):
            return number
    return None


def _as_ymd(y: int, m: int) -> date:
    """Return the first day of month *m* in year *y*."""
    return date(y, m, 1)


def _try_ymd(y: int, m: int) -> Optional[date]:
    """Like ``_as_ymd`` but return None for a year/month ``date`` rejects."""
    try:
        return _as_ymd(y, m)
    except ValueError:
        return None


def _parse_one_date(s: str) -> Optional[date]:
    """Parse one end of a date range.

    Tries, in order: a "present" marker (first day of the current month),
    ``MM.YYYY`` / ``MM/YYYY`` / ``MM-YYYY``, ``<month name> YYYY`` and a bare
    19xx/20xx year (taken as January).  Returns None when nothing matches.
    """
    s = s.strip()
    if PRESENT_RE.search(s):
        today = date.today()
        return date(today.year, today.month, 1)

    numeric = MMYYYY_RE.search(s)
    if numeric:
        # "\d{4}" also accepts "0000", which ``date`` rejects; fall through
        # to the other patterns instead of raising.
        parsed = _try_ymd(int(numeric.group(2)), int(numeric.group(1)))
        if parsed is not None:
            return parsed

    named = MON_YYYY_RE.search(s)
    if named:
        month = _parse_mon(named.group(1))
        if month:
            parsed = _try_ymd(int(named.group(2)), month)
            if parsed is not None:
                return parsed

    bare_year = YYYY_RE.search(s)
    if bare_year:
        return _as_ymd(int(bare_year.group(1)), 1)
    return None


def _split_range(line: str) -> Optional[Tuple[date, date, bool]]:
    """Return ``(start, end, is_current)`` if *line* holds a date range.

    Every separator occurrence is tried from left to right and the first one
    whose both sides parse as dates wins.  This keeps hyphens that belong to
    a date ("01-2020 - 05-2021") or to a word ("Full-time, 2019 - 2021") from
    being mistaken for the range separator.
    """
    for sep in _RANGE_SEP_RE.finditer(line):
        left = line[: sep.start()]
        right = line[sep.end():]
        if not left.strip() or not right.strip():
            continue
        start = _parse_one_date(left)
        if start is None:
            continue
        end = _parse_one_date(right)
        if end is None:
            continue
        return start, end, PRESENT_RE.search(right) is not None
    return None


def extract_positions(
    text: str, max_items: int = 40, min_year: int = 1990
) -> List[Position]:
    """Extract work positions from resume text.

    Args:
        text: Raw resume text; None or empty yields an empty list.
        max_items: Maximum number of positions to return.
        min_year: Ranges starting before this year are ignored.

    Returns:
        Positions in the order they appear in the text.
    """
    lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]
    total = len(lines)
    # Parse each line once; both the main scan and the description scan
    # below need to know which lines are date ranges.
    ranges = [_split_range(ln) for ln in lines]

    positions: List[Position] = []
    i = 0
    while i < total and len(positions) < max_items:
        found = ranges[i]
        if found is None:
            i += 1
            continue
        start, end, is_current = found

        # A bare "YYYY - YYYY" line next to education vocabulary is a study
        # period, not a job.
        ctx = " ".join(lines[max(0, i - 2): i + 4])
        if YEAR_RANGE_ONLY_RE.match(lines[i]) and EDU_CONTEXT_RE.search(ctx):
            i += 1
            continue
        if start.year < min_year:
            i += 1
            continue

        title: Optional[str] = None
        company: Optional[str] = None
        header_idx = i + 1
        body_start = header_idx
        # The line after the range is the "title, company" header, unless it
        # is itself a date range (then it starts the next entry).
        if header_idx < total and ranges[header_idx] is None:
            header = lines[header_idx]
            if EDU_CONTEXT_RE.search(header):
                i += 1
                continue
            parts = [p.strip() for p in re.split(r"[,|/]", header) if p.strip()]
            if parts:
                title = parts[0]
            if len(parts) > 1:
                company = parts[1]
            body_start = header_idx + 1

        # The description runs until the next line that really is a date
        # range; a hyphen or the word "to" alone does not end it.
        body_end = body_start
        while body_end < total and ranges[body_end] is None:
            body_end += 1
        desc_lines = lines[body_start:body_end]

        positions.append(
            Position(
                title=title,
                company=company,
                date_from=start.isoformat(),
                date_to=end.isoformat(),
                is_current=is_current,
                description="\n".join(desc_lines).strip() if desc_lines else None,
            )
        )
        i = body_end
    return positions


def positions_to_dicts(items: List[Position]) -> List[dict]:
    """Convert positions to plain dictionaries keyed by field name."""
    return [asdict(p) for p in items]