160 lines
5.0 KiB
Python
160 lines
5.0 KiB
Python
from __future__ import annotations
|
||
|
||
import re
|
||
from dataclasses import dataclass
|
||
from datetime import date
|
||
from typing import Dict, List, Optional, Tuple
|
||
|
||
# Month-name lookup (English + Russian) -> month number (1-12).
# Russian entries are stems so that inflected forms can be matched by prefix
# (e.g. "января" starts with "янв").  "мая" is listed explicitly because the
# genitive of "май" does not start with the nominative form.
MONTHS = {
    "jan": 1, "january": 1, "янв": 1, "январ": 1,
    "feb": 2, "february": 2, "фев": 2, "феврал": 2,
    "mar": 3, "march": 3, "мар": 3, "март": 3,
    "apr": 4, "april": 4, "апр": 4, "апрел": 4,
    "may": 5, "май": 5, "мая": 5,
    "jun": 6, "june": 6, "июн": 6, "июнь": 6,
    "jul": 7, "july": 7, "июл": 7, "июль": 7,
    "aug": 8, "august": 8, "авг": 8, "август": 8,
    "sep": 9, "september": 9, "сен": 9, "сент": 9,
    "oct": 10, "october": 10, "окт": 10, "октя": 10,
    "nov": 11, "november": 11, "ноя": 11, "ноябр": 11,
    "dec": 12, "december": 12, "дек": 12, "дека": 12,
}
|
||
|
||
# Markers meaning "until now" (EN + RU).  Lookarounds replace ``\b`` because
# the alternative "по н.в." ends in a non-word character, after which ``\b``
# cannot match at end of string or before whitespace.
PRESENT_RE = re.compile(
    r"(?<!\w)(present|now|current|настоящее время|по н\.в\.|по настоящее)(?!\w)",
    re.I,
)
|
||
|
||
# Direct "X years" statements (EN + RU), e.g. "5+ years", "3,5 года".
# The number is one or two digits with an optional one- or two-digit fraction
# and must not continue a longer number, so that a date such as
# "03.2019 года" is not read as the decimal 3.2019.
DIRECT_YEARS_RE = re.compile(
    r"(?<!\d)(?<!\d[.,])(\d{1,2}(?:[.,]\d{1,2})?)\s*(?:\+\s*)?(?:years?|yrs?|лет|года|год)\b",
    re.I,
)
|
||
|
||
# Single-date patterns: "03.2019", "2019", "Jan 2020", "янв 2020".
# All three limit the year to 1900-2099, so arbitrary four-digit numbers
# (e.g. "0000", which datetime.date rejects) are not taken for years.
MMYYYY_RE = re.compile(r"\b(0?[1-9]|1[0-2])[./-](19\d{2}|20\d{2})\b")
YYYY_RE = re.compile(r"\b(19\d{2}|20\d{2})\b")
MON_YYYY_RE = re.compile(r"\b([A-Za-z]{3,9}|[А-Яа-я]{3,9})\.?\s*(19\d{2}|20\d{2})\b")
|
||
|
||
# Range separators: a dash variant, or the words "to" / "по".
# The words must stand alone; without the boundaries the lazy ``a`` group
# would split inside words such as "October" or "Stockholm".
RANGE_RE = re.compile(r"(?P<a>.+?)\s*(?:—|–|-|\bto\b|\bпо\b)\s*(?P<b>.+?)$", re.I)
|
||
|
||
@dataclass
class ExpResult:
    """Outcome of one experience-extraction attempt."""

    # Estimated experience in years; None when no usable value was found.
    years: Optional[float]
    # Heuristic score attached by the extraction path that produced the result.
    confidence: float
    # Diagnostic details gathered while extracting.
    debug: Dict
|
||
|
||
def _clamp_years(y: float) -> Optional[float]:
|
||
if 0.0 <= y <= 45.0:
|
||
return y
|
||
return None
|
||
|
||
def _parse_mon(mon: str) -> Optional[int]:
    """Map a month token (English or Russian) to its number, 1-12.

    Latin tokens must be an abbreviation of a known month key, at least three
    letters long ("jan", "sept", "october").  This stops ordinary words that
    merely begin with a month abbreviation ("junior", "marketing") from being
    read as months.  Non-Latin tokens are matched by stem prefix, so inflected
    Russian forms such as "января" resolve through the stem "янв".

    Returns ``None`` when the token is not recognised.
    """
    token = re.sub(r"[^\wа-я]+", "", mon.strip().lower(), flags=re.I)
    if not token:
        return None

    if token.isascii():
        # Shorter than three letters is too ambiguous ("ma" -> March or May).
        if len(token) < 3:
            return None
        for name, number in MONTHS.items():
            if name.startswith(token):
                return number
        return None

    # Cyrillic: allow stem prefixes such as "январ", "феврал".
    for stem, number in MONTHS.items():
        if token.startswith(stem):
            return number
    return None
|
||
|
||
def _as_ymd(y: int, m: int) -> date:
|
||
return date(y, m, 1)
|
||
|
||
def _month_start_or_none(year: int, month: int) -> Optional[date]:
    """Return the first day of the given month, or ``None`` if ``date`` rejects it."""
    try:
        return _as_ymd(year, month)
    except ValueError:
        # e.g. year 0 taken from text such as "03.0000"
        return None


def _parse_one_date(s: str) -> Optional[date]:
    """Parse one month-precision date out of *s*.

    Patterns are tried in this order; the first that yields a valid date wins:

    1. a "present" marker -> first day of the current month;
    2. numeric month and year ("03.2019");
    3. month name and year ("Jan 2020", "янв 2020");
    4. a bare year ("2019") -> January of that year.

    A pattern whose numbers do not form a valid date is skipped instead of
    raising, so malformed text cannot abort the caller.  Returns ``None``
    when nothing matches.
    """
    s = s.strip()
    if PRESENT_RE.search(s):
        today = date.today()
        return date(today.year, today.month, 1)

    numeric = MMYYYY_RE.search(s)
    if numeric:
        parsed = _month_start_or_none(int(numeric.group(2)), int(numeric.group(1)))
        if parsed is not None:
            return parsed

    named = MON_YYYY_RE.search(s)
    if named:
        month = _parse_mon(named.group(1))
        if month:
            parsed = _month_start_or_none(int(named.group(2)), month)
            if parsed is not None:
                return parsed

    bare_year = YYYY_RE.search(s)
    if bare_year:
        return _month_start_or_none(int(bare_year.group(1)), 1)

    return None
|
||
|
||
def _merge_intervals(intervals: List[Tuple[date, date]]) -> List[Tuple[date, date]]:
|
||
if not intervals:
|
||
return []
|
||
intervals = sorted(intervals, key=lambda x: (x[0], x[1]))
|
||
merged = [intervals[0]]
|
||
for s, e in intervals[1:]:
|
||
ls, le = merged[-1]
|
||
if s <= le:
|
||
merged[-1] = (ls, max(le, e))
|
||
else:
|
||
merged.append((s, e))
|
||
return merged
|
||
|
||
def _months_between(a: date, b: date) -> int:
|
||
# month-level difference (inclusive-ish): b >= a
|
||
return (b.year - a.year) * 12 + (b.month - a.month)
|
||
|
||
# Separators that may sit between the start and end of a date range: any dash
# variant (with optional padding) or the stand-alone words "to" / "по".
_RANGE_SPLIT_RE = re.compile(r"\s*[—–-]\s*|\s+(?:to|по)\s+", re.I)


def _find_range(line: str) -> Optional[Tuple[date, date]]:
    """Return the first (start, end) pair found by splitting *line* at a separator.

    Every separator occurrence is tried in turn, because the first dash on a
    line is often not the range separator ("03-2019 - 05-2021",
    "Full-time developer 2019 - 2021").
    """
    for sep in _RANGE_SPLIT_RE.finditer(line):
        start = _parse_one_date(line[: sep.start()])
        if start is None:
            continue
        end = _parse_one_date(line[sep.end():])
        if end is not None:
            return start, end
    return None


def extract_experience(text: str) -> ExpResult:
    """Estimate total work experience in years from free-form text.

    Two strategies are tried in order:

    1. Explicit statements such as "5+ years" or "3,5 года".  The largest
       value within 0-45 wins and is returned with confidence 0.90.
    2. Date ranges, one per line ("03.2019 - present").  Ranges starting
       before 1990 are ignored, overlapping ranges are merged, and the month
       totals are summed.  Confidence is 0.70 when at least 12 months were
       found, otherwise 0.55.

    Args:
        text: Text to scan; ``None`` or an empty string is treated as no input.

    Returns:
        An ``ExpResult``.  ``years`` is ``None`` (with confidence 0.0) when
        neither strategy finds anything, and also ``None`` when the summed
        ranges fall outside 0-45 years.  ``debug`` holds the keys
        "direct_matches", "ranges" and "intervals".
    """
    text = text or ""
    debug: Dict = {"direct_matches": [], "ranges": [], "intervals": []}

    # 1) Direct years
    directs = []
    for m in DIRECT_YEARS_RE.finditer(text):
        try:
            value = float(m.group(1).replace(",", "."))
        except ValueError:
            continue
        if 0 <= value <= 45:
            directs.append(value)
            debug["direct_matches"].append({"match": m.group(0), "value": value})
    if directs:
        return ExpResult(years=_clamp_years(max(directs)), confidence=0.90, debug=debug)

    # 2) Ranges in lines: try to detect "start - end"
    intervals: List[Tuple[date, date]] = []
    for line in text.splitlines():
        ln = line.strip()
        # Too short to hold two dates and a separator.
        if len(ln) < 7:
            continue
        found = _find_range(ln)
        if found is None:
            continue
        start, end = found
        if end < start:
            start, end = end, start
        # cap extremely old
        if start.year < 1990:
            continue
        intervals.append((start, end))
        debug["ranges"].append({"line": ln, "start": start.isoformat(), "end": end.isoformat()})

    intervals = _merge_intervals(intervals)
    debug["intervals"] = [{"start": s.isoformat(), "end": e.isoformat()} for s, e in intervals]

    if not intervals:
        return ExpResult(years=None, confidence=0.0, debug=debug)

    total_months = sum(max(0, _months_between(s, e)) for s, e in intervals)
    years = _clamp_years(round(total_months / 12.0, 2))

    # confidence depends on amount of evidence
    conf = 0.70 if total_months >= 12 else 0.55
    return ExpResult(years=years, confidence=conf, debug=debug)
|