Files
tg_resume_db/extract/experience.py
2026-03-11 15:27:10 +03:00

160 lines
5.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from __future__ import annotations
import re
from dataclasses import dataclass
from datetime import date
from typing import Dict, List, Optional, Tuple
# Month-name prefixes (EN + RU) mapped to the month number 1-12.
# Keys are matched as *prefixes* of the lower-cased token, so "январ" covers
# "января"/"январь" and "sep" covers "sept"/"september".
MONTHS = {
    "jan": 1, "january": 1, "янв": 1, "январ": 1,
    "feb": 2, "february": 2, "фев": 2, "феврал": 2,
    "mar": 3, "march": 3, "мар": 3, "март": 3,
    "apr": 4, "april": 4, "апр": 4, "апрел": 4,
    # "мая" (genitive) and "мае" (prepositional) do not start with "май",
    # so they need their own entries for prefix matching to find them.
    "may": 5, "май": 5, "мая": 5, "мае": 5,
    "jun": 6, "june": 6, "июн": 6, "июнь": 6,
    "jul": 7, "july": 7, "июл": 7, "июль": 7,
    "aug": 8, "august": 8, "авг": 8, "август": 8,
    "sep": 9, "september": 9, "сен": 9, "сент": 9,
    "oct": 10, "october": 10, "окт": 10, "октя": 10,
    "nov": 11, "november": 11, "ноя": 11, "ноябр": 11,
    "dec": 12, "december": 12, "дек": 12, "дека": 12,
}
# Markers for an open-ended ("still working here") range end.
# Look-arounds are used instead of \b so that alternatives ending in "."
# ("по н.", "н.в.") also match at the very end of the text, where \b would
# fail after a non-word character. Group 1 holds the matched marker.
PRESENT_RE = re.compile(
    r"(?<!\w)(present|now|current|настоящее\s+время|по\s+настоящее"
    r"|по\s+н\.(?:\s*в\.?)?|н\.\s*в\.?)(?!\w)",
    re.I,
)
# Direct "X years" patterns ("3 years", "5+ yrs", "2,5 года").
# The fractional part is limited to two digits so that a date such as
# "03.2019 года" is not read as the decimal number 3.2019.
DIRECT_YEARS_RE = re.compile(
    r"(\d+(?:[.,]\d{1,2})?)\s*(?:\+?\s*)?(?:years?|yrs?|лет|года|год)\b", re.I
)
# Dates like 03.2019, 2019, Jan 2020, янв 2020
MMYYYY_RE = re.compile(r"\b(0?[1-9]|1[0-2])[./-](\d{4})\b")
YYYY_RE = re.compile(r"\b(19\d{2}|20\d{2})\b")
MON_YYYY_RE = re.compile(r"\b([A-Za-z]{3,9}|[А-Яа-я]{3,9})\.?\s*(\d{4})\b")
# Range separators: em dash (U+2014), en dash (U+2013), ASCII hyphen, "to", "по".
# The dashes are written as escapes so they cannot be lost or mistaken for a
# hyphen. A hyphen squeezed between two digits only counts as a separator when
# it directly follows a four-digit year, so the "-" inside "03-2019" is not
# taken for the range separator. "to"/"по" must be whole words ("October"
# contains "to").
RANGE_RE = re.compile(
    r"(?P<a>.+?)\s*"
    r"(?:[\u2014\u2013]|(?<![0-9])-|-(?![0-9])|(?<=[0-9]{4})-|\bto\b|\bпо\b)"
    r"\s*(?P<b>.+?)$",
    re.I,
)
@dataclass
class ExpResult:
    """Outcome of an experience-extraction attempt."""

    # Estimated years of experience; None when no usable value was found.
    years: Optional[float]
    # Score attached to the estimate by the extractor.
    confidence: float
    # Free-form diagnostic data collected while extracting.
    debug: Dict
def _clamp_years(y: float) -> Optional[float]:
    """Return *y* unchanged when it lies in [0, 45]; otherwise return None."""
    return y if 0.0 <= y <= 45.0 else None
def _parse_mon(mon: str) -> Optional[int]:
    """Map a month word to its number using the MONTHS prefix table.

    The word is stripped, lower-cased and cleaned with the substitution below
    before matching. Returns None when no MONTHS key is a prefix of it.
    """
    token = re.sub(r"[^\wа-я]+", "", mon.strip().lower(), flags=re.I)
    # Keys act as prefixes ("январ" -> "января"); the first key in MONTHS
    # order that prefixes the token wins.
    return next(
        (number for prefix, number in MONTHS.items() if token.startswith(prefix)),
        None,
    )
def _as_ymd(y: int, m: int) -> date:
    """Build the date for the first day of month *m* in year *y*."""
    return date(year=y, month=m, day=1)
def _safe_ymd(y: int, m: int) -> Optional[date]:
    """Like _as_ymd, but return None instead of raising on an invalid year."""
    try:
        return _as_ymd(y, m)
    except ValueError:
        # e.g. "12.0000": \d{4} accepts year 0, which datetime.date rejects.
        return None


def _parse_one_date(s: str) -> Optional[date]:
    """Parse one date out of *s*, normalised to the first day of its month.

    Patterns are tried in this order, and the first that yields a valid date
    wins:

    1. a "present" marker (PRESENT_RE) -> first day of the current month;
    2. numeric month + year (MMYYYY_RE), e.g. "03.2019";
    3. month name + year (MON_YYYY_RE), e.g. "Jan 2020", when the word is
       recognised by _parse_mon;
    4. a bare year (YYYY_RE) -> January of that year.

    Returns None when nothing matches. A match whose year cannot form a
    valid date is skipped rather than raising ValueError.
    """
    s = s.strip()

    if PRESENT_RE.search(s):
        today = date.today()
        return date(today.year, today.month, 1)

    numeric = MMYYYY_RE.search(s)
    if numeric:
        parsed = _safe_ymd(int(numeric.group(2)), int(numeric.group(1)))
        if parsed is not None:
            return parsed

    named = MON_YYYY_RE.search(s)
    if named:
        mon = _parse_mon(named.group(1))
        if mon:
            parsed = _safe_ymd(int(named.group(2)), mon)
            if parsed is not None:
                return parsed

    # YYYY_RE only accepts 19xx/20xx, so this construction cannot fail.
    bare_year = YYYY_RE.search(s)
    if bare_year:
        return _as_ymd(int(bare_year.group(1)), 1)

    return None
def _merge_intervals(intervals: List[Tuple[date, date]]) -> List[Tuple[date, date]]:
    """Collapse overlapping or touching (start, end) pairs into disjoint ones.

    The input is not modified; the result is ordered by start, then end.
    An empty input yields an empty list.
    """
    if not intervals:
        return []

    head, *tail = sorted(intervals, key=lambda pair: (pair[0], pair[1]))
    result = [head]
    for start, end in tail:
        prev_start, prev_end = result[-1]
        if start > prev_end:
            # Gap before this interval: it opens a new merged span.
            result.append((start, end))
            continue
        # Overlaps or touches the previous span: extend it if needed.
        result[-1] = (prev_start, max(prev_end, end))
    return result
def _months_between(a: date, b: date) -> int:
    """Return the number of calendar months from *a* to *b*.

    Only year and month are considered (the day is ignored); the result is
    negative when *b* falls in an earlier month than *a*.
    """
    year_gap = b.year - a.year
    month_gap = b.month - a.month
    return year_gap * 12 + month_gap
# Candidate separators between the start and the end of a date range:
# em dash (U+2014), en dash (U+2013), ASCII hyphen, and the whole words
# "to" / "по". Dashes are written as escapes so they cannot be lost or
# confused with the hyphen.
_RANGE_SEP_RE = re.compile(r"\s*(?:[\u2014\u2013-]|\bto\b|\bпо\b)\s*", re.I)


def _split_date_range(line: str) -> Optional[Tuple[date, date]]:
    """Return the dates around the first separator in *line* that has a
    parseable date on both sides, or None when there is no such separator.

    Every separator candidate is tried in turn, so a hyphen inside a date
    ("03-2019 - 05-2020") or inside a word does not hide the real separator.
    """
    for sep in _RANGE_SEP_RE.finditer(line):
        left = _parse_one_date(line[: sep.start()])
        if left is None:
            continue
        right = _parse_one_date(line[sep.end():])
        if right is not None:
            return left, right
    return None


def extract_experience(text: str) -> ExpResult:
    """Estimate total years of experience from free-form resume text.

    Two strategies are tried in order:

    1. Direct statements matched by DIRECT_YEARS_RE ("5 years", "3,5 года").
       If any value lies in [0, 45], the largest one is returned with
       confidence 0.90.
    2. Date ranges, one per line ("03.2019 - 05.2020", "Jan 2019 to present").
       Ranges starting before 1990 are dropped, overlapping ranges are merged,
       and the merged months are summed. Confidence is 0.70 when at least
       12 months were found, otherwise 0.55.

    Returns an ExpResult. ``years`` is None with confidence 0.0 when the text
    is empty, nothing was found, or the summed total falls outside [0, 45].
    ``debug`` lists the direct matches, the accepted range lines and the
    merged intervals.
    """
    debug: Dict = {"direct_matches": [], "ranges": [], "intervals": []}
    if not text:
        return ExpResult(years=None, confidence=0.0, debug=debug)

    # 1) Direct years
    directs: List[float] = []
    for m in DIRECT_YEARS_RE.finditer(text):
        try:
            value = float(m.group(1).replace(",", "."))
        except ValueError:
            # The pattern only captures digits with an optional fraction, so
            # this is purely defensive; skip anything float() cannot read.
            continue
        if _clamp_years(value) is None:
            continue
        directs.append(value)
        debug["direct_matches"].append({"match": m.group(0), "value": value})
    if directs:
        return ExpResult(years=max(directs), confidence=0.90, debug=debug)

    # 2) Ranges in lines: try to detect "start - end"
    intervals: List[Tuple[date, date]] = []
    for raw_line in text.splitlines():
        ln = raw_line.strip()
        if len(ln) < 7:
            # Too short to hold two dates and a separator.
            continue
        found = _split_date_range(ln)
        if found is None:
            continue
        start, end = found
        if end < start:
            start, end = end, start
        # cap extremely old
        if start.year < 1990:
            continue
        intervals.append((start, end))
        debug["ranges"].append(
            {"line": ln, "start": start.isoformat(), "end": end.isoformat()}
        )

    intervals = _merge_intervals(intervals)
    debug["intervals"] = [
        {"start": s.isoformat(), "end": e.isoformat()} for s, e in intervals
    ]
    if not intervals:
        return ExpResult(years=None, confidence=0.0, debug=debug)

    total_months = sum(max(0, _months_between(s, e)) for s, e in intervals)
    years = _clamp_years(round(total_months / 12.0, 2))
    if years is None:
        # An implausible total is reported as "no estimate", with the same
        # zero confidence as the nothing-found case.
        return ExpResult(years=None, confidence=0.0, debug=debug)

    # confidence depends on amount of evidence
    conf = 0.70 if total_months >= 12 else 0.55
    return ExpResult(years=years, confidence=conf, debug=debug)