from __future__ import annotations import re from dataclasses import dataclass from datetime import date from typing import Dict, List, Optional, Tuple # Month maps (EN + RU) MONTHS = { "jan": 1, "january": 1, "янв": 1, "январ": 1, "feb": 2, "february": 2, "фев": 2, "феврал": 2, "mar": 3, "march": 3, "мар": 3, "март": 3, "apr": 4, "april": 4, "апр": 4, "апрел": 4, "may": 5, "май": 5, "jun": 6, "june": 6, "июн": 6, "июнь": 6, "jul": 7, "july": 7, "июл": 7, "июль": 7, "aug": 8, "august": 8, "авг": 8, "август": 8, "sep": 9, "september": 9, "сен": 9, "сент": 9, "oct": 10, "october": 10, "окт": 10, "октя": 10, "nov": 11, "november": 11, "ноя": 11, "ноябр": 11, "dec": 12, "december": 12, "дек": 12, "дека": 12, } PRESENT_RE = re.compile(r"\b(present|now|current|настоящее время|по н\.в\.|по настоящее)\b", re.I) # Direct "X years" patterns DIRECT_YEARS_RE = re.compile(r"(\d+(?:[.,]\d+)?)\s*(?:\+?\s*)?(?:years?|yrs?|лет|года|год)\b", re.I) # Dates like 03.2019, 2019, Jan 2020, янв 2020 MMYYYY_RE = re.compile(r"\b(0?[1-9]|1[0-2])[./-](\d{4})\b") YYYY_RE = re.compile(r"\b(19\d{2}|20\d{2})\b") MON_YYYY_RE = re.compile(r"\b([A-Za-z]{3,9}|[А-Яа-я]{3,9})\.?\s*(\d{4})\b") # Range separators RANGE_RE = re.compile(r"(?P.+?)\s*(?:—|–|-|to|по)\s*(?P.+?)$", re.I) @dataclass class ExpResult: years: Optional[float] confidence: float debug: Dict def _clamp_years(y: float) -> Optional[float]: if 0.0 <= y <= 45.0: return y return None def _parse_mon(mon: str) -> Optional[int]: m = mon.strip().lower() m = re.sub(r"[^\wа-я]+", "", m, flags=re.I) # allow prefixes: "январ", "феврал" for k, v in MONTHS.items(): if m.startswith(k): return v return None def _as_ymd(y: int, m: int) -> date: return date(y, m, 1) def _parse_one_date(s: str) -> Optional[date]: s = s.strip() if PRESENT_RE.search(s): today = date.today() return date(today.year, today.month, 1) m1 = MMYYYY_RE.search(s) if m1: mm = int(m1.group(1)) yy = int(m1.group(2)) return _as_ymd(yy, mm) m2 = MON_YYYY_RE.search(s) if m2: mon = _parse_mon(m2.group(1)) yy = int(m2.group(2)) if mon: return _as_ymd(yy, mon) m3 = YYYY_RE.search(s) if m3: yy = int(m3.group(1)) return _as_ymd(yy, 1) return None def _merge_intervals(intervals: List[Tuple[date, date]]) -> List[Tuple[date, date]]: if not intervals: return [] intervals = sorted(intervals, key=lambda x: (x[0], x[1])) merged = [intervals[0]] for s, e in intervals[1:]: ls, le = merged[-1] if s <= le: merged[-1] = (ls, max(le, e)) else: merged.append((s, e)) return merged def _months_between(a: date, b: date) -> int: # month-level difference (inclusive-ish): b >= a return (b.year - a.year) * 12 + (b.month - a.month) def extract_experience(text: str) -> ExpResult: debug: Dict = {"direct_matches": [], "ranges": [], "intervals": []} # 1) Direct years directs = [] for m in DIRECT_YEARS_RE.finditer(text): try: v = float(m.group(1).replace(",", ".")) if 0 <= v <= 45: directs.append(v) debug["direct_matches"].append({"match": m.group(0), "value": v}) except Exception: pass if directs: years = _clamp_years(max(directs)) return ExpResult(years=years, confidence=0.90, debug=debug) # 2) Ranges in lines: try to detect "start - end" intervals: List[Tuple[date, date]] = [] for line in text.splitlines(): ln = line.strip() if len(ln) < 7: continue # require range separator if not any(x in ln for x in ("—", "–", "-", " to ", " по ")): continue rr = RANGE_RE.match(ln) if not rr: continue a = rr.group("a") b = rr.group("b") da = _parse_one_date(a) db = _parse_one_date(b) if da and db: if db < da: da, db = db, da # cap extremely old if da.year < 1990: continue intervals.append((da, db)) debug["ranges"].append({"line": ln, "start": da.isoformat(), "end": db.isoformat()}) intervals = _merge_intervals(intervals) debug["intervals"] = [{"start": s.isoformat(), "end": e.isoformat()} for s, e in intervals] if not intervals: return ExpResult(years=None, confidence=0.0, debug=debug) total_months = 0 for s, e in intervals: total_months += max(0, _months_between(s, e)) years = round(total_months / 12.0, 2) years = _clamp_years(years) if years is not None else None # confidence depends on amount of evidence conf = 0.70 if total_months >= 12 else 0.55 return ExpResult(years=years, confidence=conf, debug=debug)