Initial commit
This commit is contained in:
159
extract/experience.py
Normal file
159
extract/experience.py
Normal file
@@ -0,0 +1,159 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from datetime import date
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
# Month maps (EN + RU).
# Keys are lower-case stems: a token is resolved to a month when it *starts
# with* one of these keys, so "январ" also covers "января" and "sep" also
# covers "sept". Russian May needs two keys because the nominative "май" is
# not a prefix of the genitive "мая" that is used in dates ("мая 2020").
MONTHS = {
    "jan": 1, "january": 1, "янв": 1, "январ": 1,
    "feb": 2, "february": 2, "фев": 2, "феврал": 2,
    "mar": 3, "march": 3, "мар": 3, "март": 3,
    "apr": 4, "april": 4, "апр": 4, "апрел": 4,
    "may": 5, "май": 5, "мая": 5,
    "jun": 6, "june": 6, "июн": 6, "июнь": 6,
    "jul": 7, "july": 7, "июл": 7, "июль": 7,
    "aug": 8, "august": 8, "авг": 8, "август": 8,
    "sep": 9, "september": 9, "сен": 9, "сент": 9,
    "oct": 10, "october": 10, "окт": 10, "октя": 10,
    "nov": 11, "november": 11, "ноя": 11, "ноябр": 11,
    "dec": 12, "december": 12, "дек": 12, "дека": 12,
}
|
||||
|
||||
# Markers meaning "until now" (EN + RU).
# The right edge is checked with (?!\w) instead of \b: "\b" after the final
# dot of "по н.в." only matches when a word character follows, so the
# abbreviation was never recognised at the end of a line or before a space.
# Bare "н.в." is accepted too (what is left after a range is split on "по"),
# but only in lower case so that initials such as "Н.В." are not mistaken
# for it.
PRESENT_RE = re.compile(
    r"\b(present|now|current|настоящее время|по настоящее"
    r"|(?:по\s+)?(?-i:н\.\s?в\.?))(?!\w)",
    re.I,
)
|
||||
|
||||
# Direct "X years" patterns, e.g. "5+ years", "3.5 yrs", "1,5 года", "10 лет".
# Group 1 is the number (decimal point or comma).
# The lookbehind plus the 1-2 digit limits keep calendar dates from being read
# as durations: without them "01.01.2019 года" produced "01.2019" -> 1.2 years.
DIRECT_YEARS_RE = re.compile(
    r"(?<![\d.,])(\d{1,2}(?:[.,]\d{1,2})?)\s*\+?\s*(?:years?|yrs?|лет|года|год)\b",
    re.I,
)
|
||||
|
||||
# Dates like 03.2019, 2019, Jan 2020, янв 2020.
# All three patterns accept only years 1900-2099. The numeric and month-name
# forms used to take any four digits, so "12-0000" was treated as a date and
# date() later raised ValueError on year 0.
# Groups: MMYYYY_RE -> (month, year); YYYY_RE -> (year,);
# MON_YYYY_RE -> (month word, year).
MMYYYY_RE = re.compile(r"\b(0?[1-9]|1[0-2])[./-]((?:19|20)\d{2})\b")
YYYY_RE = re.compile(r"\b(19\d{2}|20\d{2})\b")
MON_YYYY_RE = re.compile(r"\b([A-Za-z]{3,9}|[А-Яа-я]{3,9})\.?\s*((?:19|20)\d{2})\b")
|
||||
|
||||
# Range separators: "<start> <sep> <end>" with sep one of — – - to по.
# * The start half must contain a 4-digit year, so a hyphen inside a word or
#   inside "03-2019" is not taken as the range separator.
# * "to" / "по" must be whole words; otherwise "October" was split as
#   "Oc" + "to" + "ber".
RANGE_RE = re.compile(
    r"(?P<a>.*?\d{4}.*?)\s*(?:—|–|-|\bto\b|\bпо\b)\s*(?P<b>.+?)$",
    re.I,
)
|
||||
|
||||
@dataclass
class ExpResult:
    """Result of one experience-extraction run over a text."""

    # Estimated experience in years; None when nothing usable was found.
    years: Optional[float]
    # Heuristic certainty of the estimate (0.0 means no result).
    confidence: float
    # Diagnostic trace of what was matched while extracting.
    debug: Dict
|
||||
|
||||
def _clamp_years(y: float) -> Optional[float]:
    """Return *y* when it lies within 0..45 years (inclusive), else ``None``."""
    return y if 0.0 <= y <= 45.0 else None
|
||||
|
||||
def _parse_mon(mon: str) -> Optional[int]:
    """Map a month word (EN or RU) to its number 1-12, or ``None`` if unknown.

    The word is lower-cased and stripped of punctuation, then matched by
    prefix against the keys of ``MONTHS`` so that inflected or abbreviated
    forms ("январ", "феврал") resolve as well.
    """
    token = re.sub(r"[^\wа-я]+", "", mon.strip().lower(), flags=re.I)
    # First key (in MONTHS insertion order) that the token starts with wins.
    return next(
        (number for stem, number in MONTHS.items() if token.startswith(stem)),
        None,
    )
|
||||
|
||||
def _as_ymd(y: int, m: int) -> date:
    """Build the date for the first day of month *m* in year *y*."""
    return date(year=y, month=m, day=1)
|
||||
|
||||
def _safe_ymd(y: int, m: int) -> Optional[date]:
    """Like ``_as_ymd`` but return ``None`` when ``date`` rejects the values."""
    try:
        return _as_ymd(y, m)
    except ValueError:
        # e.g. year 0 from a "01.0000"-style token
        return None


def _parse_one_date(s: str) -> Optional[date]:
    """Parse one side of a date range into a first-of-month date.

    Patterns are tried in this order and the first usable one wins:

    1. a "present" marker (``PRESENT_RE``) -> first day of the current month;
    2. numeric month and year, e.g. "03.2019" (``MMYYYY_RE``);
    3. month word and year, e.g. "Jan 2020" / "янв 2020" (``MON_YYYY_RE``),
       used only when the word is a known month;
    4. a bare year (``YYYY_RE``) -> January of that year.

    A match whose numbers do not form a valid date (year 0) is skipped and
    the next pattern is tried; previously this raised ``ValueError`` and
    aborted the whole extraction.

    Returns ``None`` when no pattern yields a date.
    """
    s = s.strip()

    if PRESENT_RE.search(s):
        today = date.today()
        return date(today.year, today.month, 1)

    numeric = MMYYYY_RE.search(s)
    if numeric:
        parsed = _safe_ymd(int(numeric.group(2)), int(numeric.group(1)))
        if parsed is not None:
            return parsed

    worded = MON_YYYY_RE.search(s)
    if worded:
        mon = _parse_mon(worded.group(1))
        # An unknown word before the year falls through to the bare-year case.
        if mon:
            parsed = _safe_ymd(int(worded.group(2)), mon)
            if parsed is not None:
                return parsed

    bare_year = YYYY_RE.search(s)
    if bare_year:
        return _safe_ymd(int(bare_year.group(1)), 1)

    return None
|
||||
|
||||
def _merge_intervals(intervals: List[Tuple[date, date]]) -> List[Tuple[date, date]]:
    """Collapse overlapping or touching (start, end) pairs into disjoint ones.

    The result is ordered by start date. Two intervals are merged when the
    later one starts on or before the end of the earlier one.
    """
    merged: List[Tuple[date, date]] = []
    for start, end in sorted(intervals, key=lambda span: (span[0], span[1])):
        if merged and start <= merged[-1][1]:
            prev_start, prev_end = merged[-1]
            merged[-1] = (prev_start, max(prev_end, end))
        else:
            merged.append((start, end))
    return merged
|
||||
|
||||
def _months_between(a: date, b: date) -> int:
    """Return the whole-month distance from *a* to *b*; days are ignored.

    The value is negative when *b* falls in an earlier month than *a*.
    """
    year_gap = b.year - a.year
    month_gap = b.month - a.month
    return year_gap * 12 + month_gap
|
||||
|
||||
def _direct_years(text: str, debug: Dict) -> List[float]:
    """Collect explicit "N years" figures in the 0..45 range from *text*.

    Each accepted figure is also recorded in ``debug["direct_matches"]``.
    """
    found: List[float] = []
    for m in DIRECT_YEARS_RE.finditer(text):
        try:
            value = float(m.group(1).replace(",", "."))
        except ValueError:
            # Not expected for a digits-only capture; skip this match only.
            continue
        if 0 <= value <= 45:
            found.append(value)
            debug["direct_matches"].append({"match": m.group(0), "value": value})
    return found


def _range_intervals(text: str, debug: Dict) -> List[Tuple[date, date]]:
    """Parse at most one "start <sep> end" date range from each line of *text*.

    Each accepted range is also recorded in ``debug["ranges"]``.
    """
    intervals: List[Tuple[date, date]] = []
    for line in text.splitlines():
        ln = line.strip()
        if len(ln) < 7:
            continue
        # Cheap pre-filter: the line must contain a range separator.
        if not any(sep in ln for sep in ("—", "–", "-", " to ", " по ")):
            continue
        rr = RANGE_RE.match(ln)
        if not rr:
            continue
        try:
            da = _parse_one_date(rr.group("a"))
            db = _parse_one_date(rr.group("b"))
        except ValueError:
            # A token such as "01.0000" cannot form a date; drop this line
            # instead of aborting the whole extraction.
            continue
        if da is None or db is None:
            continue
        if db < da:
            da, db = db, da
        # Ranges starting before 1990 are ignored as implausible.
        if da.year < 1990:
            continue
        intervals.append((da, db))
        debug["ranges"].append({"line": ln, "start": da.isoformat(), "end": db.isoformat()})
    return intervals


def extract_experience(text: str) -> ExpResult:
    """Estimate total years of work experience mentioned in *text*.

    Two strategies are tried in order:

    1. Direct statements ("5+ years", "3 года"): the largest figure within
       0..45 is returned with confidence 0.90.
    2. Date ranges, one per line ("03.2019 - present"): overlapping ranges
       are merged and their month lengths summed. Confidence is 0.70 when
       at least 12 months were found, otherwise 0.55.

    Returns:
        An ``ExpResult``. ``years`` is ``None`` with confidence 0.0 when the
        text is empty, when nothing was found, or when the summed ranges
        fall outside 0..45 years (this last case used to report 0.70 next to
        a ``None`` value). ``debug`` always has the keys "direct_matches",
        "ranges" and "intervals".
    """
    debug: Dict = {"direct_matches": [], "ranges": [], "intervals": []}
    if not text:
        return ExpResult(years=None, confidence=0.0, debug=debug)

    # 1) Direct years
    directs = _direct_years(text, debug)
    if directs:
        # Values were already limited to 0..45 when collected.
        return ExpResult(years=max(directs), confidence=0.90, debug=debug)

    # 2) Ranges in lines: "start - end"
    intervals = _merge_intervals(_range_intervals(text, debug))
    debug["intervals"] = [{"start": s.isoformat(), "end": e.isoformat()} for s, e in intervals]
    if not intervals:
        return ExpResult(years=None, confidence=0.0, debug=debug)

    total_months = sum(max(0, _months_between(s, e)) for s, e in intervals)
    years = _clamp_years(round(total_months / 12.0, 2))
    if years is None:
        # An implausible total is treated the same as no result.
        return ExpResult(years=None, confidence=0.0, debug=debug)

    # confidence depends on amount of evidence
    conf = 0.70 if total_months >= 12 else 0.55
    return ExpResult(years=years, confidence=conf, debug=debug)
|
||||
Reference in New Issue
Block a user