Files
tg_resume_db/extract/experience_timeline.py
2026-03-11 15:27:10 +03:00

145 lines
4.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from __future__ import annotations
import re
from dataclasses import dataclass, asdict
from datetime import date
from typing import List, Optional
# Month-name prefixes (English and Russian, lower-case) mapped to month numbers.
# _parse_mon() accepts a token when it *starts with* one of these keys, so the
# Russian entries are stems that cover inflected forms ("январ" -> "января",
# "январь").  "мая" is listed explicitly because the genitive of "май" used in
# dates ("мая 2020") does not start with "май".
MONTHS = {
    "jan": 1, "january": 1, "янв": 1, "январ": 1,
    "feb": 2, "february": 2, "фев": 2, "феврал": 2,
    "mar": 3, "march": 3, "мар": 3, "март": 3,
    "apr": 4, "april": 4, "апр": 4, "апрел": 4,
    "may": 5, "май": 5, "мая": 5,
    "jun": 6, "june": 6, "июн": 6, "июнь": 6,
    "jul": 7, "july": 7, "июл": 7, "июль": 7,
    "aug": 8, "august": 8, "авг": 8, "август": 8,
    "sep": 9, "september": 9, "сен": 9, "сент": 9,
    "oct": 10, "october": 10, "окт": 10, "октя": 10,
    "nov": 11, "november": 11, "ноя": 11, "ноябр": 11,
    "dec": 12, "december": 12, "дек": 12, "дека": 12,
}
# Words that mark an open-ended end date ("present", "по н.в.", ...).
# The dot in "по н." is escaped with a single backslash so it matches a literal
# dot; the trailing \b then requires a word character right after it ("н.в").
PRESENT_RE = re.compile(
    r"\b(present|now|current|настоящее время|по н\.|по настоящее)\b", re.I
)
# Numeric month and year: "03.2021", "3/2021", "03-2021".
MMYYYY_RE = re.compile(r"\b(0?[1-9]|1[0-2])[./-](\d{4})\b")
# A bare year between 1900 and 2099.
YYYY_RE = re.compile(r"\b(19\d{2}|20\d{2})\b")
# A Latin or Cyrillic word of 3-9 letters (optionally dotted) followed by a year.
MON_YYYY_RE = re.compile(r"\b([A-Za-z]{3,9}|[А-Яа-я]{3,9})\.?\s*(\d{4})\b")
# Separator between the two ends of a date range:
#   * em dash (U+2014) or en dash (U+2013), written as escapes so the source
#     contains no look-alike characters;
#   * a hyphen, unless it directly follows a standalone 1-2 digit number, in
#     which case it is the month/year hyphen of "01-2020" (see MMYYYY_RE);
#   * the whole words "to" / "по" (word boundaries keep "October" intact).
_RANGE_SEP = (
    r"(?:[\u2014\u2013]"
    r"|(?<!\b\d)(?<!\b\d\d)-"
    r"|\b(?:to|по)\b)"
)
# "<a> <separator> <b>"; group "a" is lazy, so the first separator splits.
RANGE_RE = re.compile(r"(?P<a>.+?)\s*" + _RANGE_SEP + r"\s*(?P<b>.+?)$", re.I)
# A line consisting only of "YYYY <separator> YYYY".
YEAR_RANGE_ONLY_RE = re.compile(
    r"^\s*\d{4}\s*" + _RANGE_SEP + r"\s*\d{4}\s*$", re.I
)
# Education vocabulary.  English words must match whole; Russian entries are
# stems and may be followed by any ending ("академ" -> "академия").
EDU_CONTEXT_RE = re.compile(
    r"\b("
    r"(?:education|university|institute|college|academy|school|bachelor|master|degree|faculty)\b|"
    r"(?:образовани|университет|институт|академ|колледж|школ|бакалав|магистр|факультет)\w*"
    r")",
    re.I,
)
@dataclass
class Position:
    """One work-experience entry as produced by extract_positions()."""

    # First part of the line that follows the date range (split on "," "|" "/").
    title: Optional[str]
    # Second part of that same line, when there is one.
    company: Optional[str]
    # Start of the range as an ISO date string (first day of the month).
    date_from: Optional[str]
    # End of the range as an ISO date string (first day of the month).
    date_to: Optional[str]
    # True when the end of the range matched PRESENT_RE.
    is_current: Optional[bool]
    # Lines after the title line joined with "\n", or None when there are none.
    description: Optional[str]
def _parse_mon(mon: str) -> Optional[int]:
    """Return the month number for a month word, or None if unrecognized.

    The word is trimmed, lower-cased and stripped of non-word characters, then
    compared against the MONTHS keys in their definition order; the first key
    that the cleaned word starts with wins.
    """
    token = re.sub(r"[^\wа-я]+", "", mon.strip().lower(), flags=re.I)
    return next(
        (number for prefix, number in MONTHS.items() if token.startswith(prefix)),
        None,
    )
def _as_ymd(y: int, m: int) -> date:
    """Return the date of the first day of month ``m`` in year ``y``."""
    first_of_month = date(year=y, month=m, day=1)
    return first_of_month
def _parse_one_date(s: str) -> Optional[date]:
    """Parse one end of a date range into a first-of-month date.

    Recognized forms, tried in this order:

    1. a "present" marker (PRESENT_RE) -> first day of the current month;
    2. numeric month and year (MMYYYY_RE), e.g. "03.2021";
    3. month word and year (MON_YYYY_RE), when _parse_mon() knows the word;
    4. a bare year (YYYY_RE) -> 1 January of that year.

    Returns None when nothing matches, or when the matched numbers do not form
    a valid date (e.g. the year "0000").
    """
    s = s.strip()
    if PRESENT_RE.search(s):
        return date.today().replace(day=1)

    try:
        numeric = MMYYYY_RE.search(s)
        if numeric:
            return _as_ymd(int(numeric.group(2)), int(numeric.group(1)))

        named = MON_YYYY_RE.search(s)
        if named:
            month = _parse_mon(named.group(1))
            if month:
                return _as_ymd(int(named.group(2)), month)

        bare_year = YYYY_RE.search(s)
        if bare_year:
            return _as_ymd(int(bare_year.group(1)), 1)
    except ValueError:
        # The patterns accept any four digits as a year, but date() rejects
        # year 0; treat such text as "not a date" instead of aborting the
        # caller's whole extraction.
        return None
    return None
# Substrings whose presence makes a line worth testing as a date range:
# em dash, en dash, hyphen, and the words "to" / "по" surrounded by spaces.
# The dashes are written as escapes so no look-alike characters (or empty
# strings, which are "in" every line) can end up in the tuple.
_RANGE_MARKERS = ("\u2014", "\u2013", "-", " to ", " по ")


def _read_date_range(line: str) -> Optional[tuple[date, date, str]]:
    """Return (start, end, raw end text) if ``line`` is a parseable date range, else None."""
    if not any(marker in line for marker in _RANGE_MARKERS):
        return None
    found = RANGE_RE.match(line)
    if not found:
        return None
    start = _parse_one_date(found.group("a"))
    end = _parse_one_date(found.group("b"))
    if not start or not end:
        return None
    return start, end, found.group("b")


def extract_positions(text: str, max_items: int = 40) -> List[Position]:
    """Extract work-experience entries from resume text.

    The text is split into stripped, non-empty lines.  A line whose two sides
    both parse as dates starts an entry; the next line supplies the title and
    company (split on ",", "|" or "/"), and the lines after that, up to the
    next date-range line, form the description.

    A date-range line is skipped when:

    * it is a bare "YYYY - YYYY" range and education vocabulary appears within
      two lines before or three lines after it;
    * its start year is earlier than 1990;
    * the line right after it contains education vocabulary.

    Args:
        text: Resume text; None or "" yields an empty list.
        max_items: Maximum number of entries to return.

    Returns:
        Entries in the order they appear in the text.
    """
    lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]
    total = len(lines)
    positions: List[Position] = []
    i = 0
    while i < total and len(positions) < max_items:
        line = lines[i]
        parsed = _read_date_range(line)
        if parsed is None:
            i += 1
            continue
        start, end, end_text = parsed

        context = " ".join(lines[max(0, i - 2): i + 4])
        if YEAR_RANGE_ONLY_RE.match(line) and EDU_CONTEXT_RE.search(context):
            i += 1
            continue
        if start.year < 1990:
            i += 1
            continue

        title = None
        company = None
        if i + 1 < total:
            header = lines[i + 1]
            if EDU_CONTEXT_RE.search(header):
                i += 1
                continue
            parts = [p.strip() for p in re.split(r"[,|/]", header) if p.strip()]
            if parts:
                title = parts[0]
            if len(parts) > 1:
                company = parts[1]

        # The description runs until the next line that really is a date range.
        # Stopping at any line that merely contains a separator would cut the
        # description at the first hyphenated word and lose everything after it.
        desc_lines: List[str] = []
        j = i + 2
        while j < total and _read_date_range(lines[j]) is None:
            desc_lines.append(lines[j])
            j += 1

        positions.append(
            Position(
                title=title,
                company=company,
                date_from=start.isoformat(),
                date_to=end.isoformat(),
                is_current=PRESENT_RE.search(end_text) is not None,
                description="\n".join(desc_lines).strip() if desc_lines else None,
            )
        )
        i = j
    return positions
def positions_to_dicts(items: List[Position]) -> List[dict]:
    """Convert each Position into a plain dict with dataclasses.asdict."""
    return list(map(asdict, items))