Initial commit
This commit is contained in:
144
extract/experience_timeline.py
Normal file
144
extract/experience_timeline.py
Normal file
@@ -0,0 +1,144 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass, asdict
|
||||
from datetime import date
|
||||
from typing import List, Optional
|
||||
|
||||
# Month-name prefixes (English and Russian) mapped to the month number.
# _parse_mon() matches these with str.startswith(), so a stem such as
# "январ" covers "январь", "января", etc.
MONTHS = {
    "jan": 1, "january": 1, "янв": 1, "январ": 1,
    "feb": 2, "february": 2, "фев": 2, "феврал": 2,
    "mar": 3, "march": 3, "мар": 3, "март": 3,
    "apr": 4, "april": 4, "апр": 4, "апрел": 4,
    # "мая" / "мае" are the inflected forms of "май" ("с мая 2020",
    # "в мае 2020"); they share no listed prefix with "май", so they need
    # their own entries.
    "may": 5, "май": 5, "мая": 5, "мае": 5,
    "jun": 6, "june": 6, "июн": 6, "июнь": 6,
    "jul": 7, "july": 7, "июл": 7, "июль": 7,
    "aug": 8, "august": 8, "авг": 8, "август": 8,
    "sep": 9, "september": 9, "сен": 9, "сент": 9,
    "oct": 10, "october": 10, "окт": 10, "октя": 10,
    "nov": 11, "november": 11, "ноя": 11, "ноябр": 11,
    "dec": 12, "december": 12, "дек": 12, "дека": 12,
}
|
||||
|
||||
# Markers meaning "still ongoing" (English and Russian).
# The trailing guard is ``(?!\w)`` rather than ``\b``: an alternative that
# ends in "." (``н.в.``) can never be followed by a word boundary at the end
# of the text, so ``\b`` made "по н.в." unmatchable.
# The bare abbreviation "н.в." is matched case-sensitively so that personal
# initials such as "Н.В." are not mistaken for "present".
PRESENT_RE = re.compile(
    r"\b(present|now|current|настоящее время|по настоящее|"
    r"по\s+н\.\s?в\.?|(?-i:н\.\s?в\.?))(?!\w)",
    re.I,
)

# Numeric month + year: "03.2020", "3/2020", "03-2020".
MMYYYY_RE = re.compile(r"\b(0?[1-9]|1[0-2])[./-](\d{4})\b")

# A bare four-digit year in 1900-2099.
YYYY_RE = re.compile(r"\b(19\d{2}|20\d{2})\b")

# Month name (Latin or Cyrillic, optionally abbreviated with ".") + year.
MON_YYYY_RE = re.compile(r"\b([A-Za-z]{3,9}|[А-Яа-я]{3,9})\.?\s*(\d{4})\b")

# "<start> <separator> <end>" on a single line.
#  * "to" / "по" must be whole words; otherwise "October" is split at its
#    inner "to".
#  * A hyphen is NOT a separator when it sits between a 1-2 digit number and
#    a four-digit year, because that is the "mm-yyyy" date form accepted by
#    MMYYYY_RE ("01-2020 - 05-2021").
RANGE_RE = re.compile(
    r"(?P<a>.+?)\s*"
    r"(?:—|–|(?<!\b\d)(?<!\b\d\d)-|-(?!\d{4}\b)|\bto\b|\bпо\b)"
    r"\s*(?P<b>.+?)$",
    re.I,
)

# A line consisting of nothing but "YYYY <separator> YYYY".
YEAR_RANGE_ONLY_RE = re.compile(r"^\s*\d{4}\s*(?:—|–|-|to|по)\s*\d{4}\s*$", re.I)

# Words that indicate an education entry rather than a job.
# English keywords are matched as whole words. Russian entries are stems, so
# they are allowed to continue with any word characters ("академ" ->
# "академия", "университет" -> "университета"); with a plain ``\b`` right
# after the stem those inflected forms never matched.
EDU_CONTEXT_RE = re.compile(
    r"\b("
    r"education|university|institute|college|academy|school|bachelor|master|degree|faculty|"
    r"(?:образовани|университет|институт|академ|колледж|школ|бакалав|магистр|факультет)\w*"
    r")\b",
    re.I,
)
|
||||
|
||||
|
||||
@dataclass
class Position:
    """A single work-experience entry produced by ``extract_positions``."""

    # First fragment of the header line that follows the date range
    # (the header is split on ",", "|" and "/"); None when there is no header.
    title: Optional[str]
    # Second fragment of that header line, when present.
    company: Optional[str]
    # ISO-8601 date string for the start of the range (day is always 01).
    date_from: Optional[str]
    # ISO-8601 date string for the end of the range (day is always 01).
    date_to: Optional[str]
    # True when the end of the range is a "present"-style marker.
    is_current: Optional[bool]
    # Lines following the header, joined with newlines; None when empty.
    description: Optional[str]
|
||||
|
||||
|
||||
def _parse_mon(mon: str) -> Optional[int]:
    """Translate a month name or abbreviation into its number (1-12).

    The input is trimmed, lower-cased and stripped of non-word characters,
    then compared against the keys of ``MONTHS`` as prefixes. Returns
    ``None`` when no key is a prefix of the cleaned text.
    """
    cleaned = re.sub(r"[^\wа-я]+", "", mon.strip().lower(), flags=re.I)
    # First key (in MONTHS order) that the cleaned text starts with wins.
    return next(
        (number for prefix, number in MONTHS.items() if cleaned.startswith(prefix)),
        None,
    )
|
||||
|
||||
|
||||
def _as_ymd(y: int, m: int) -> date:
|
||||
return date(y, m, 1)
|
||||
|
||||
|
||||
def _parse_one_date(s: str) -> Optional[date]:
    """Parse one side of a date range into a first-of-month ``date``.

    Patterns are tried in this order:

    1. a "present" marker (``PRESENT_RE``) -> first day of the current month;
    2. numeric month and year (``MMYYYY_RE``), e.g. ``03.2020``;
    3. month name and year (``MON_YYYY_RE``), e.g. ``Mar 2020``;
    4. a bare year (``YYYY_RE``) -> January of that year.

    Returns ``None`` when nothing matches. A match whose year cannot be
    represented by ``date`` (e.g. ``05.0000``) is ignored and the next
    pattern is tried, instead of letting ``ValueError`` escape.
    """
    s = s.strip()

    if PRESENT_RE.search(s):
        return date.today().replace(day=1)

    numeric = MMYYYY_RE.search(s)
    if numeric:
        try:
            return _as_ymd(int(numeric.group(2)), int(numeric.group(1)))
        except ValueError:
            # Any four digits are accepted by the pattern; "0000" is not a
            # valid year for date().
            pass

    named = MON_YYYY_RE.search(s)
    if named:
        month = _parse_mon(named.group(1))
        # The word before the year may be something other than a month.
        if month:
            try:
                return _as_ymd(int(named.group(2)), month)
            except ValueError:
                pass

    bare_year = YYYY_RE.search(s)
    if bare_year:
        return _as_ymd(int(bare_year.group(1)), 1)

    return None
|
||||
|
||||
|
||||
def _parse_range_line(line: str) -> Optional[tuple]:
    """Parse ``line`` as a date range.

    Returns ``(start, end, is_current)`` - two ``date`` objects and a bool -
    when the line contains a range separator, matches ``RANGE_RE`` and both
    sides parse as dates. Returns ``None`` otherwise.
    """
    # Cheap pre-filter before running the regex.
    if not any(sep in line for sep in ("—", "–", "-", " to ", " по ")):
        return None
    found = RANGE_RE.match(line)
    if not found:
        return None
    start = _parse_one_date(found.group("a"))
    end = _parse_one_date(found.group("b"))
    if not start or not end:
        return None
    is_current = PRESENT_RE.search(found.group("b")) is not None
    return start, end, is_current


def extract_positions(text: str, max_items: int = 40, min_year: int = 1990) -> List[Position]:
    """Extract work-experience entries from free-form resume text.

    The text is split into non-empty, stripped lines. Each line that parses
    as a date range starts an entry: the following line is the header
    (split on ``,``, ``|`` or ``/`` into title and company) and the lines
    after it form the description, up to the next line that parses as a
    date range.

    A date-range line is skipped when:

    * it is a bare ``YYYY - YYYY`` range and education keywords occur within
      the two preceding / three following lines;
    * its start year is earlier than ``min_year``;
    * the header line itself contains education keywords.

    Args:
        text: Resume text; ``None`` or empty yields an empty list.
        max_items: Maximum number of positions to return.
        min_year: Ranges starting before this year are ignored.

    Returns:
        Positions in the order they appear in ``text``.
    """
    lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]
    positions: List[Position] = []
    i = 0
    while i < len(lines) and len(positions) < max_items:
        ln = lines[i]
        parsed = _parse_range_line(ln)
        if parsed is None:
            i += 1
            continue
        start, end, is_current = parsed

        # Bare year ranges near education keywords are study periods.
        ctx = " ".join(lines[max(0, i - 2): i + 4])
        if YEAR_RANGE_ONLY_RE.match(ln) and EDU_CONTEXT_RE.search(ctx):
            i += 1
            continue

        if start.year < min_year:
            i += 1
            continue

        title = None
        company = None
        if i + 1 < len(lines):
            header = lines[i + 1]
            if EDU_CONTEXT_RE.search(header):
                i += 1
                continue
            parts = [p.strip() for p in re.split(r"[,|/]", header) if p.strip()]
            if parts:
                title = parts[0]
            if len(parts) > 1:
                company = parts[1]

        # Collect the description up to the next real date range. Checking
        # only RANGE_RE here would stop at any hyphenated word or "to"/"по"
        # and silently drop the rest of the description.
        desc_lines: List[str] = []
        j = i + 2
        while j < len(lines) and _parse_range_line(lines[j]) is None:
            desc_lines.append(lines[j])
            j += 1

        positions.append(
            Position(
                title=title,
                company=company,
                date_from=start.isoformat(),
                date_to=end.isoformat(),
                is_current=is_current,
                description="\n".join(desc_lines).strip() if desc_lines else None,
            )
        )
        i = j
    return positions
|
||||
|
||||
|
||||
def positions_to_dicts(items: List[Position]) -> List[dict]:
    """Convert each position into a plain ``dict`` via ``dataclasses.asdict``."""
    return list(map(asdict, items))
|
||||
Reference in New Issue
Block a user