Initial commit

This commit is contained in:
2026-03-11 15:27:10 +03:00
commit 8b4b8d54d1
34 changed files with 7407 additions and 0 deletions

159
extract/experience.py Normal file
View File

@@ -0,0 +1,159 @@
from __future__ import annotations
import re
from dataclasses import dataclass
from datetime import date
from typing import Dict, List, Optional, Tuple
# Month maps (EN + RU).
# Keys are lowercase prefixes: _parse_mon accepts a token when it *starts
# with* one of them, so "январ" also covers "января", "январь", etc.
# May needs its genitive form spelled out ("мая 2020"), because "мая" does
# not start with "май".
MONTHS = {
    "jan": 1, "january": 1, "янв": 1, "январ": 1,
    "feb": 2, "february": 2, "фев": 2, "феврал": 2,
    "mar": 3, "march": 3, "мар": 3, "март": 3,
    "apr": 4, "april": 4, "апр": 4, "апрел": 4,
    "may": 5, "май": 5, "мая": 5,
    "jun": 6, "june": 6, "июн": 6, "июнь": 6,
    "jul": 7, "july": 7, "июл": 7, "июль": 7,
    "aug": 8, "august": 8, "авг": 8, "август": 8,
    "sep": 9, "september": 9, "сен": 9, "сент": 9,
    "oct": 10, "october": 10, "окт": 10, "октя": 10,
    "nov": 11, "november": 11, "ноя": 11, "ноябр": 11,
    "dec": 12, "december": 12, "дек": 12, "дека": 12,
}
# Markers meaning "still ongoing" (EN + RU), e.g. "present", "по н.в.".
# Group 1 captures the whole marker. The dotted abbreviations sit outside
# the trailing \b: a word boundary cannot follow "." when whitespace or the
# end of the text comes next. "н.в." is listed on its own because a leading
# "по" may already have been consumed as the range separator.
PRESENT_RE = re.compile(
    r"\b((?:present|now|current|настоящее время|по настоящее)\b|по н\.|н\.\s?в\.)",
    re.I,
)
# Direct "X years" patterns
DIRECT_YEARS_RE = re.compile(r"(\d+(?:[.,]\d+)?)\s*(?:\+?\s*)?(?:years?|yrs?|лет|года|год)\b", re.I)
# Dates like 03.2019, 2019, Jan 2020, янв 2020
MMYYYY_RE = re.compile(r"\b(0?[1-9]|1[0-2])[./-](\d{4})\b")
YYYY_RE = re.compile(r"\b(19\d{2}|20\d{2})\b")
MON_YYYY_RE = re.compile(r"\b([A-Za-z]{3,9}|[А-Яа-я]{3,9})\.?\s*(\d{4})\b")
# Range separators: em dash, en dash or hyphen (spaces optional), or the
# words "to" / "по", which must be surrounded by whitespace so they do not
# split inside words such as "October". The dashes are written as \u escapes
# so they stay visible and survive re-encoding of the file.
RANGE_RE = re.compile(
    r"(?P<a>.+?)(?:\s*[\u2014\u2013-]\s*|\s+(?:to|по)\s+)(?P<b>.+?)$",
    re.I,
)
@dataclass
class ExpResult:
    """Outcome of one experience-extraction pass over a piece of text."""

    # Estimated years of experience; None when no usable value was found.
    years: Optional[float]
    # Heuristic score assigned by the extractor (0.0 when nothing matched).
    confidence: float
    # Raw matches and intermediate values kept for troubleshooting.
    debug: Dict
def _clamp_years(y: float) -> Optional[float]:
if 0.0 <= y <= 45.0:
return y
return None
def _parse_mon(mon: str) -> Optional[int]:
    """Map a month word (EN or RU, any case) to its number 1..12.

    The token is lowercased and stripped of non-word characters, then
    compared against the prefixes in ``MONTHS``; the first prefix (in
    dictionary order) that the token starts with wins, so inflected forms
    such as "января" or "феврале" are accepted.

    Returns:
        The month number, or ``None`` when no prefix matches.
    """
    token = re.sub(r"[^\wа-я]+", "", mon.strip().lower(), flags=re.I)
    return next(
        (number for prefix, number in MONTHS.items() if token.startswith(prefix)),
        None,
    )
def _as_ymd(y: int, m: int) -> date:
return date(y, m, 1)
def _parse_one_date(s: str) -> Optional[date]:
    """Parse one endpoint of a date range into a first-of-month ``date``.

    Patterns are tried in order and the first usable one wins:

    1. a "present" marker (``PRESENT_RE``) -> first day of the current month;
    2. numeric month and year such as "03.2019" (``MMYYYY_RE``);
    3. month name plus year such as "Jan 2020" (``MON_YYYY_RE``), provided
       ``_parse_mon`` recognises the name;
    4. a bare year 19xx/20xx (``YYYY_RE``) -> January of that year.

    Args:
        s: Text fragment expected to contain a single date.

    Returns:
        The parsed date, or ``None`` when nothing in *s* looks like a date.
    """
    s = s.strip()
    if PRESENT_RE.search(s):
        return date.today().replace(day=1)

    # The year part of the patterns below is a plain \d{4}, which also
    # admits "0000"; date() rejects years below date.min.year, so such
    # matches are skipped instead of raising ValueError.
    m1 = MMYYYY_RE.search(s)
    if m1:
        yy = int(m1.group(2))
        if yy >= date.min.year:
            return _as_ymd(yy, int(m1.group(1)))

    m2 = MON_YYYY_RE.search(s)
    if m2:
        mon = _parse_mon(m2.group(1))
        yy = int(m2.group(2))
        # An unrecognised word before the year falls through to the
        # bare-year pattern below.
        if mon and yy >= date.min.year:
            return _as_ymd(yy, mon)

    m3 = YYYY_RE.search(s)
    if m3:
        return _as_ymd(int(m3.group(1)), 1)
    return None
def _merge_intervals(intervals: List[Tuple[date, date]]) -> List[Tuple[date, date]]:
if not intervals:
return []
intervals = sorted(intervals, key=lambda x: (x[0], x[1]))
merged = [intervals[0]]
for s, e in intervals[1:]:
ls, le = merged[-1]
if s <= le:
merged[-1] = (ls, max(le, e))
else:
merged.append((s, e))
return merged
def _months_between(a: date, b: date) -> int:
# month-level difference (inclusive-ish): b >= a
return (b.year - a.year) * 12 + (b.month - a.month)
def extract_experience(text: str) -> ExpResult:
    """Estimate the total years of work experience mentioned in *text*.

    Two strategies are tried in order:

    1. Direct statements such as "5 years" or "3 года" (``DIRECT_YEARS_RE``).
       The largest value within 0..45 is returned with confidence 0.90.
    2. Date ranges, one per line (e.g. "03.2019 - 05.2021"). Overlapping
       ranges are merged and the covered months are summed; confidence is
       0.70 for at least 12 months of evidence and 0.55 otherwise.

    Args:
        text: Raw text to scan; may span several lines.

    Returns:
        An ``ExpResult``. ``years`` is ``None`` with confidence 0.0 when the
        text is empty, nothing usable was found, or the summed total falls
        outside the range accepted by ``_clamp_years``. ``debug`` holds the
        raw findings under ``direct_matches``, ``ranges`` and ``intervals``.
    """
    debug: Dict = {"direct_matches": [], "ranges": [], "intervals": []}
    if not text:
        return ExpResult(years=None, confidence=0.0, debug=debug)

    # 1) Direct years
    directs: List[float] = []
    for m in DIRECT_YEARS_RE.finditer(text):
        try:
            v = float(m.group(1).replace(",", "."))
        except ValueError:
            continue
        if 0 <= v <= 45:
            directs.append(v)
            debug["direct_matches"].append({"match": m.group(0), "value": v})
    if directs:
        years = _clamp_years(max(directs))
        return ExpResult(years=years, confidence=0.90, debug=debug)

    # 2) Ranges in lines: try to detect "start - end"
    # Cheap pre-filter before running the range regex. The dashes are
    # spelled as escapes (em dash, en dash) so the tuple never degrades to
    # empty strings, which would make the check always true.
    separators = ("\u2014", "\u2013", "-", " to ", " по ")
    intervals: List[Tuple[date, date]] = []
    for line in text.splitlines():
        ln = line.strip()
        if len(ln) < 7:
            continue
        lowered = ln.lower()
        if not any(sep in lowered for sep in separators):
            continue
        rr = RANGE_RE.match(ln)
        if not rr:
            continue
        da = _parse_one_date(rr.group("a"))
        db = _parse_one_date(rr.group("b"))
        if da is None or db is None:
            continue
        if db < da:
            # Tolerate ranges written end-first.
            da, db = db, da
        # cap extremely old
        if da.year < 1990:
            continue
        intervals.append((da, db))
        debug["ranges"].append({"line": ln, "start": da.isoformat(), "end": db.isoformat()})

    intervals = _merge_intervals(intervals)
    debug["intervals"] = [{"start": s.isoformat(), "end": e.isoformat()} for s, e in intervals]
    if not intervals:
        return ExpResult(years=None, confidence=0.0, debug=debug)

    total_months = sum(max(0, _months_between(s, e)) for s, e in intervals)
    years = _clamp_years(round(total_months / 12.0, 2))
    if years is None:
        # Implausible total: report "no result" the same way as the
        # no-evidence path instead of attaching a confidence to None.
        return ExpResult(years=None, confidence=0.0, debug=debug)

    # confidence depends on amount of evidence
    conf = 0.70 if total_months >= 12 else 0.55
    return ExpResult(years=years, confidence=conf, debug=debug)