from __future__ import annotations

import json
import re
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

from tg_resume_db.normalize import normalize_skill
from tg_resume_db.extract.experience import extract_experience

EMAIL_RE = re.compile(r"\b[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}\b", re.I)

# "petr ivanov@mail.ru"-style splits: a local-part fragment separated from the
# rest of the address by whitespace (a common PDF-extraction artifact).
EMAIL_SPLIT_RE = re.compile(
    r"(?P<prefix>[a-z0-9][a-z0-9._%+\-]{1,40})\s+"
    r"(?P<tail>[a-z0-9][a-z0-9._%+\-]{0,40}@[a-z0-9.\-]+\.[a-z]{2,})",
    re.I,
)

# NOTE: the bodies of several patterns and vocabularies below were lost to
# extraction damage. Everything marked "reconstructed" is inferred from the
# call sites (group numbers, normalization code) and is an assumption, not
# the canonical definition.
PHONE_RE = re.compile(r"(?<!\w)(\+?\d[\d\s\-().]{8,18}\d)")  # reconstructed
TG_RE = re.compile(r"(?:t\.me/|(?<![\w.@])@)([a-z0-9_]{5,32})\b", re.I)  # reconstructed
GITHUB_RE = re.compile(r"github\.com/([a-z0-9._\-]+)", re.I)  # reconstructed
LINKEDIN_RE = re.compile(r"linkedin\.com/in/([a-z0-9._%\-]+)", re.I)  # reconstructed
URL_RE = re.compile(r"https?://[^\s<>\])]+", re.I)  # reconstructed

# Skill vocabulary (reconstructed, representative core; the original list was
# longer and project-specific).
SKILLS = {
    "python", "java", "javascript", "typescript", "go", "kotlin", "swift",
    "c++", "c#", "sql", "postgresql", "mysql", "redis", "docker",
    "kubernetes", "django", "fastapi", "react", "linux", "git",
}

_SKILL_ALIASES: Dict[str, List[str]] = {  # reconstructed
    "javascript": ["js"],
    "typescript": ["ts"],
    "kubernetes": ["k8s"],
    "postgresql": ["postgres"],
    "go": ["golang"],
}


def _build_skill_patterns() -> List[Tuple[str, re.Pattern]]:
    patterns: List[Tuple[str, re.Pattern]] = []
    for skill in sorted(SKILLS):
        aliases = [skill] + _SKILL_ALIASES.get(skill, [])
        for alias in aliases:
            if skill == "java" and alias == "java":
                # Do not match "java" inside "java script".
                pat = re.compile(r"(?<!\w)java(?!\s*script)(?!\w)", re.I)
            else:
                pat = re.compile(r"(?<!\w)" + re.escape(alias) + r"(?!\w)", re.I)
            patterns.append((skill, pat))
    return patterns


_SKILL_PATTERNS = _build_skill_patterns()

# Role vocabulary (reconstructed, representative).
ROLES = ["backend", "frontend", "fullstack", "mobile", "devops", "qa", "data"]

_ROLE_ALIASES: Dict[str, List[str]] = {  # reconstructed
    "backend": ["backend", "back-end", "бэкенд"],
    "frontend": ["frontend", "front-end", "фронтенд"],
    "fullstack": ["fullstack", "full-stack", "full stack", "фулстек"],
    "mobile": ["mobile", "android", "ios"],
    "devops": ["devops", "dev ops", "sre"],
    "qa": ["qa", "тестировщик", "quality assurance"],
    "data": ["data engineer", "data scientist", "ml engineer"],
}


def _build_role_patterns() -> Dict[str, List[re.Pattern]]:
    out: Dict[str, List[re.Pattern]] = {}
    for role in ROLES:
        aliases = _ROLE_ALIASES.get(role, [role])
        out[role] = [
            re.compile(r"(?<!\w)" + re.escape(alias) + r"(?!\w)", re.I)
            for alias in aliases
        ]
    return out


_ROLE_PATTERNS = _build_role_patterns()

# Recruiter/HR detection and the context/section patterns used by the
# extractors below (all reconstructed from usage).
NON_TECH_ROLES_RE = re.compile(
    r"(?i)\b(recruiter|recruitment|talent\s+acquisition|sourcer|"
    r"hr\s+(?:manager|generalist|bp)|рекрутер|рекрутмент|подбор\s+персонала)\b"
)
HR_CONTEXT_RE = re.compile(
    r"(?i)\b(hiring|vacanc\w*|candidates?|sourcing|ваканси\w*|кандидат\w*|нанима\w*)\b"
)
REMOTE_RE = re.compile(r"(?i)(\bremote\b|удал[её]нн\w*|удал[её]нка)")
EN_RE = re.compile(r"\b([ABCabc][12]\+?)\b")
EN_LANG_RE = re.compile(r"(?i)\b(english|англ\w*)\b")
EN_TEXT_RE = re.compile(
    r"(?i)\b(native|fluent|proficient|advanced|upper[\s\-]?intermediate|"
    r"intermediate|elementary)\b"
)
SALARY_HINT_RE = re.compile(
    r"(?i)\b(salary|compensation|зарплат\w*|доход\w*|оплат\w*|компенсаци\w*)\b"
)
PAY_TOKEN_RE = re.compile(r"(?i)(₽|\$|€|руб\w*|rub|usd|eur|gross|net|на\s+руки)")
SALARY_NOISE_RE = re.compile(r"(?i)\b(phone|тел\w*|телефон|users?|пользовател\w*)\b")
CURRENCY_RE = re.compile(r"(?i)(₽|\$|€|руб\w*|rub|usd|eur)")
NUM_RE = re.compile(
    r"(\d{2,4})\s*(k|к|тыс\.?)(?![a-zа-я])"  # "150к", "150 тыс" -> thousands
    r"|(\d{1,3}(?:\s\d{3})+)"                # "150 000" (space-grouped)
    r"|(\d{5,8})",                           # "150000" (plain)
    re.I,
)
SECTION_HEADER_RE = re.compile(
    r"(?i)^\s*(experience|work experience|education|skills|top skills|projects|"
    r"summary|about|profile|languages|certifications|contacts?|contact info|"
    r"опыт работы|образование|навыки|проекты|языки|контакты|о себе)\s*:?\s*$"
)
LOCATION_CITY_COUNTRY_RE = re.compile(
    r"^[A-ZА-ЯЁ][A-Za-zА-Яа-яЁё' .\-]{1,40},\s*[A-ZА-ЯЁ][A-Za-zА-Яа-яЁё' .\-]{1,40}$"
)

# Experience summaries. Only the heads of EXP_SUMMARY_RE/EXP_HEADER_RE were
# lost and are reconstructed; the year/month tails are original.
EXP_HEADER_RE = re.compile(
    r"(?i)\b(опыт\s+работы|общий\s+опыт|work\s+experience|experience)\b"
)
AGE_LINE_RE = re.compile(r"(?i)(возраст|\bage\b|\bborn\b|родил|date\s+of\s+birth)")
EXP_SUMMARY_RE = re.compile(
    r"(?i)(?:опыт\s+работы|общий\s+опыт|work\s+experience|experience)"
    r"\D{0,20}(?P<y>\d{1,2})\s*(?:год|года|лет|years?|yrs?)"
    r"(?:[^0-9]{0,20}(?P<m>\d{1,2})\s*(?:мес|месяц|месяца|месяцев|months?))?"
)
EXP_NEARBY_RE = re.compile(
    r"(?i)\b(?P<y>\d{1,2})\s*(?:год|года|лет|years?|yrs?)"
    r"(?:[^0-9]{0,20}(?P<m>\d{1,2})\s*(?:мес|месяц|месяца|месяцев|months?))?"
)

HH_FOOTER_RE = re.compile(
    r"(?P<name>[A-Za-zА-ЯЁ][A-Za-zА-Яа-яЁё'\-\s]{2,80})\s*[•|]\s*резюме\s+обновлено",
    re.I,
)
NAME_KV_RE = re.compile(r"^\s*(name|имя)\s*[:\-]\s*(.+)$", re.I)
NAME_LINE_RE = re.compile(
    r"^[A-ZА-ЯЁ][A-Za-zА-Яа-яЁё'\-]+(?:\s+[A-ZА-ЯЁ][A-Za-zА-Яа-яЁё'\-]+){1,3}$"
)
NAME_STOPWORDS = {
    "resume", "cv", "contacts", "contact", "summary", "skills", "experience",
    "education", "projects", "about", "profile", "objective",
    "навыки", "опыт", "образование", "контакты", "профиль", "цель", "резюме",
    "developer", "engineer", "backend", "frontend", "fullstack",
    "team lead", "tech lead", "backend developer", "frontend developer",
    "fullstack developer", "software engineer",
    "разработчик", "инженер", "бэкенд", "фронтенд", "фулстек", "тимлид", "техлид",
    "top skills", "languages", "certifications", "skills & endorsements",
    "endorsements",
    "university", "state university", "institute", "college", "academy",
    "school", "bachelor", "master", "degree", "faculty",
    "университет", "институт", "академия", "колледж", "школа",
    "бакалавр", "магистр", "факультет",
}

_NAME_BAD_WORDS = {
    "skills", "top skills", "experience", "education", "languages",
    "certifications", "projects", "summary", "about", "profile",
    "endorsements", "university", "institute", "college", "academy",
    "school", "bachelor", "master", "degree", "faculty",
}

NAME_INSTITUTION_RE = re.compile(
    r"\b("
    r"university|institute|college|academy|school|faculty|bachelor|master|degree|"
    r"mathematics|computer science|informatics|physics|economics|management|"
    r"университет|институт|академ|колледж|школа|факультет|бакалав|магистр|"
    r"математик|информатик|физик|экономик|менеджмент"
    r")\b",
    re.I,
)

_EMAIL_PREFIX_STOP = {
    "email", "mail", "contact", "contacts", "phone", "tel",
    "telegram", "linkedin", "github",
}


def _prune_fragment_emails(values: List[str]) -> List[str]:
    uniq = sorted(set(v.lower().strip() for v in values if v and "@" in v))
    out: List[str] = []
    for e in uniq:
        local, domain = e.split("@", 1)
        drop = False
        for other in uniq:
            if other == e:
                continue
            ol, od = other.split("@", 1)
            if od != domain:
                continue
            if (
                len(local) <= 8
                and len(ol) > len(local) + 2
                and ol.endswith(local)
                and re.search(r"[._\-]", ol)
            ):
                drop = True
                break
        if not drop:
            out.append(e)
    return out
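
# Illustrative check of the fragment-pruning rule above (a sketch, not part of
# the pipeline; the helper name is invented). A short local-part that is a
# suffix of a longer address on the same domain is assumed to be extraction
# debris and dropped.
def _demo_prune_fragment_emails() -> None:  # hypothetical helper
    assert _prune_fragment_emails(
        ["ivanov@example.com", "petr.ivanov@example.com"]
    ) == ["petr.ivanov@example.com"]
    # Different domains are never pruned against each other.
    assert _prune_fragment_emails(["ab@x.com", "petr.ab@y.com"]) == [
        "ab@x.com",
        "petr.ab@y.com",
    ]
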
def extract_experience_years(
    text: str,
) -> Tuple[Optional[float], Optional[float], float, Dict[str, Any]]:
    """
    Return (total_years, engineering_years, confidence, debug).

    Logic:
      1. Compute TOTAL experience from summary lines.
      2. Check whether the candidate is primarily a recruiter/HR:
         - if yes: engineering_years = 0.0 (keeps recruiters from surfacing
           as senior developers);
         - if no: engineering_years = total_years (optimistic assumption for
           genuine developers).
    """
    dbg: Dict[str, Any] = {"method": None, "matched": None, "is_recruiter": False}
    total_years: Optional[float] = None
    confidence = 0.0

    lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]

    # 1. Recruiter detection: check the header (first ~15 lines) for HR titles.
    header_text = "\n".join(lines[:15])
    is_recruiter = bool(NON_TECH_ROLES_RE.search(header_text))
    dbg["is_recruiter"] = is_recruiter

    # 2. Extract total duration.
    if lines:
        # Strategy A: explicit summary line.
        for i, ln in enumerate(lines[:200]):
            if AGE_LINE_RE.search(ln):
                continue
            # Look for a summary line.
            if EXP_HEADER_RE.search(ln):
                window = ln
                if i + 1 < len(lines):
                    window += " " + lines[i + 1]
                if i + 2 < len(lines):
                    window += " " + lines[i + 2]
                m = EXP_SUMMARY_RE.search(window)
                if m:
                    y = int(m.group("y"))
                    mm = int(m.group("m")) if m.group("m") else 0
                    val = float(round(y + (mm / 12.0), 2))
                    if 0 <= val <= 60:
                        total_years = val
                        dbg["method"] = "summary"
                        dbg["matched"] = m.group(0)
                        confidence = 0.95
                        break

        # Strategy B: fall back to numbers near an experience header.
        if total_years is None:
            safe_lines = [ln for ln in lines if not AGE_LINE_RE.search(ln)]
            for i, ln in enumerate(safe_lines):
                if not EXP_HEADER_RE.search(ln):
                    continue
                chunk = " ".join(safe_lines[i : i + 12])
                m = EXP_NEARBY_RE.search(chunk)
                if m:
                    y = int(m.group("y"))
                    mm = int(m.group("m")) if m.group("m") else 0
                    val = float(round(y + (mm / 12.0), 2))
                    if 0 <= val <= 60:
                        total_years = val
                        dbg["method"] = "header_chunk"
                        dbg["matched"] = m.group(0)
                        confidence = 0.80
                        break

    # 2.5 Timeline/range fallback and reconciliation. Protects against the
    # summary parser catching one short fragment while the CV has a long
    # date-range timeline.
    try:
        alt = extract_experience(text or "")
    except Exception:
        alt = None
    if alt and alt.years is not None:
        if total_years is None:
            total_years = alt.years
            confidence = max(confidence, alt.confidence)
            dbg["method"] = "timeline_fallback"
            dbg["matched"] = "date_ranges"
        elif alt.years > (total_years + 1.0):
            strong_summary = (
                str(dbg.get("method") or "") in ("summary", "header_chunk")
                and confidence >= 0.78
            )
            if strong_summary and (alt.years - float(total_years)) > 1.5:
                dbg["reconcile"] = "timeline_skip_strong_summary"
            else:
                total_years = alt.years
                confidence = max(confidence, min(0.82, alt.confidence))
                dbg["method"] = "timeline_reconcile"
                dbg["matched"] = "date_ranges"

    # 3. Compute engineering years.
    eng_years = total_years
    if is_recruiter:
        # A recruiter's "engineering" experience is effectively 0 for the
        # purpose of finding a developer.
        eng_years = 0.0

    return total_years, eng_years, confidence, dbg
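
# Usage sketch for extract_experience_years (the résumé text is invented;
# expected values assume the reconstructed EXP_* patterns above and that the
# external timeline parser finds no date ranges in these snippets).
def _demo_experience_years() -> None:  # hypothetical helper
    text = "Иван Иванов\nОпыт работы — 7 лет 3 месяца\nBackend developer (Python)"
    total, eng, conf, dbg = extract_experience_years(text)
    # Strategy A should fire: 7 years + 3 months -> 7.25 at confidence 0.95.
    assert dbg["method"] == "summary" and total == 7.25

    recruiter = "Анна Смирнова\nIT Recruiter\nОпыт работы — 5 лет"
    _, eng_r, _, dbg_r = extract_experience_years(recruiter)
    # A recruiter header zeroes out engineering years but keeps the total.
    assert dbg_r["is_recruiter"] and eng_r == 0.0
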
def _norm_phone(p: str) -> str:
    digits = re.sub(r"\D+", "", p)
    if digits.startswith("8") and len(digits) == 11:
        digits = "7" + digits[1:]
    return "+" + digits if digits else ""


def _norm_token(s: str) -> str:
    return re.sub(r"\s+", " ", s.strip().lower())


def safe_json(v: Any) -> str:
    return json.dumps(v, ensure_ascii=False)


def extract_contacts(text: str) -> Dict[str, List[str]]:
    t = text or ""
    emails_set = set(m.group(0).lower() for m in EMAIL_RE.finditer(t))
    for m in EMAIL_SPLIT_RE.finditer(t):
        prefix = m.group("prefix").strip().lower().strip(".-_")
        if not prefix or prefix in _EMAIL_PREFIX_STOP:
            continue
        if not re.search(r"[._\-\d]", prefix):
            continue
        tail = m.group("tail").lower()
        if "@" not in tail:
            continue
        local_tail, domain = tail.split("@", 1)
        local = f"{prefix}{local_tail}"
        if len(local) > 64:
            continue
        cand = f"{local}@{domain}"
        if EMAIL_RE.fullmatch(cand):
            emails_set.add(cand)

    emails = _prune_fragment_emails(sorted(emails_set))
    phones = sorted(
        set(
            _norm_phone(m.group(1))
            for m in PHONE_RE.finditer(t)
            if _norm_phone(m.group(1))
        )
    )
    tg = sorted(set(m.group(1).lower() for m in TG_RE.finditer(t)))
    gh = sorted(set(m.group(1).lower() for m in GITHUB_RE.finditer(t)))
    li = sorted(set(m.group(1).lower() for m in LINKEDIN_RE.finditer(t)))
    urls = sorted(set(m.group(0) for m in URL_RE.finditer(t)))
    return {
        "emails": emails,
        "phones": phones,
        "telegram": tg,
        "github": gh,
        "linkedin": li,
        "urls": urls,
    }
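
# Contact-extraction sketch. The sample values are invented; the phone
# normalisation (8XXX... -> +7XXX...) comes from _norm_phone, and the handle
# patterns are the reconstructed ones above.
def _demo_contacts() -> None:  # hypothetical helper
    text = (
        "Ivan Petrov\n"
        "ivan.petrov@example.com\n"
        "8 (912) 345-67-89\n"
        "github.com/ivanpetrov | t.me/ivan_petrov\n"
    )
    c = extract_contacts(text)
    assert c["emails"] == ["ivan.petrov@example.com"]
    assert c["phones"] == ["+79123456789"]
    assert c["github"] == ["ivanpetrov"] and c["telegram"] == ["ivan_petrov"]
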
def extract_name_guess(text: str) -> Optional[str]:
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    if not lines:
        return None

    # 1) HH footer: "Name • Резюме обновлено ...".
    m = HH_FOOTER_RE.search(text or "")
    if m:
        cand = m.group("name").strip()
        if _looks_like_name_line(cand):
            return cand

    # 2) Key-value line: "Name: ..." / "Имя: ...".
    for ln in lines[:40]:
        m2 = NAME_KV_RE.match(ln)
        if m2:
            cand = m2.group(2).strip()
            cand = re.split(r"[|,/;]", cand)[0].strip()
            if _looks_like_name_line(cand):
                return cand

    # 3) Name-like line in the first ~40 lines.
    for ln in lines[:40]:
        if _looks_like_heading_line(ln):
            continue
        if _looks_like_name_line(ln):
            return ln

    # 4) Name-like line near the end (pptx exports often put the name there).
    tail_start = max(0, len(lines) - 60)
    for i in range(tail_start, len(lines)):
        ln = lines[i]
        if _looks_like_heading_line(ln):
            continue
        ctx = " ".join(lines[max(0, i - 2) : min(len(lines), i + 3)]).lower()
        if NAME_INSTITUTION_RE.search(ctx):
            continue
        if _looks_like_name_line(ln):
            return ln
    return None


def _looks_like_heading_line(line: str) -> bool:
    low = (line or "").strip().lower()
    if not low:
        return False
    if low in _NAME_BAD_WORDS:
        return True
    if low.startswith("top skills"):
        return True
    if len(low.split()) <= 3 and any(
        w in low for w in ("skills", "experience", "education", "languages")
    ):
        return True
    return False


def _looks_like_name_line(line: str) -> bool:
    if not line:
        return False
    if len(line) > 80:
        return False
    low = line.lower().strip()
    if low in NAME_STOPWORDS:
        return False
    if _looks_like_heading_line(line):
        return False
    if re.search(r"\b(resume|cv|резюме)\b", line, re.I):
        return False
    if NAME_INSTITUTION_RE.search(line):
        return False
    if not NAME_LINE_RE.match(line.strip()):
        return False
    return True


def extract_remote(text: str) -> Optional[bool]:
    if not text:
        return None
    for ln in text.splitlines()[:120]:
        if REMOTE_RE.search(ln):
            return True
    return None


def extract_english(text: str) -> Optional[str]:
    t = text or ""
    lines = [ln.strip() for ln in t.splitlines() if ln.strip()]

    # 1) CEFR levels anywhere are accepted.
    m = EN_RE.search(t)
    if m:
        return m.group(1).replace("+", "").upper()

    # 2) Textual levels only when English context is present.
    candidate_chunks: List[str] = []
    for i, ln in enumerate(lines):
        if EN_LANG_RE.search(ln):
            candidate_chunks.append(ln)
            if i + 1 < len(lines):
                candidate_chunks.append(lines[i + 1])
    if not candidate_chunks:
        return None
    m2 = EN_TEXT_RE.search("\n".join(candidate_chunks))
    if not m2:
        return None
    word = m2.group(1).lower()
    if word in ("native", "fluent", "proficient", "advanced"):
        return "C1"
    if word.startswith("upper"):
        return "B2"
    if word == "intermediate":
        return "B1"
    if word == "elementary":
        return "A2"
    return None
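
# English-level mapping sketch (CEFR codes are captured anywhere; textual
# levels only with English context nearby). Expected values assume the
# reconstructed EN_* patterns above.
def _demo_english() -> None:  # hypothetical helper
    assert extract_english("Languages: English — B2, German — A2") == "B2"
    assert extract_english("English: upper-intermediate") == "B2"
    assert extract_english("Английский — fluent") == "C1"
    # No English context and no CEFR code -> nothing is claimed.
    assert extract_english("Люблю читать книги") is None
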
def extract_roles_skills(text: str) -> Tuple[List[str], List[str]]:
    """
    Extract roles and skills, strictly filtering out HR/recruitment context.
    """
    lines = text.splitlines()

    # 1. Filter text: drop lines that talk about hiring/vacancies.
    clean_lines = []
    for ln in lines:
        if not HR_CONTEXT_RE.search(ln):
            clean_lines.append(ln)
    clean_text = "\n".join(clean_lines).lower()

    # 2. Extract skills from the clean text only.
    skills = []
    for s, pat in _SKILL_PATTERNS:
        if pat.search(clean_text):
            skills.append(normalize_skill(s) or s)
    skills = sorted(set(skills))

    # 3. Extract roles. Priority: header (first 10 lines).
    header_text = "\n".join(lines[:10]).lower()
    found_roles = set()

    if NON_TECH_ROLES_RE.search(header_text):
        # Explicit recruiter in the header: do NOT add generic tech roles like
        # "backend" even if they appear in the text (they often describe who
        # the recruiter hires).
        pass
    else:
        # Normal extraction.
        for r in ROLES:
            pats = _ROLE_PATTERNS.get(r, [])
            if any(p.search(clean_text) for p in pats):
                # Extra guard: devops requires explicit evidence, not just
                # CI/CD mentions.
                if r == "devops":
                    if not re.search(
                        r"\b(devops|dev ops|sre|platform engineer|infrastructure)\b",
                        clean_text,
                        re.I,
                    ):
                        continue
                found_roles.add(r)

    return sorted(found_roles), skills


def norm_pipe(tokens: List[str]) -> str:
    toks = [_norm_token(t) for t in tokens if _norm_token(t)]
    uniq = sorted(set(toks))
    return "|" + "|".join(uniq) + "|" if uniq else "|"


def extract_salary(text: str) -> Tuple[Optional[int], Optional[int], float, Dict]:
    dbg: Dict[str, Any] = {
        "numbers": [],
        "currency_hits": 0,
        "hint_lines": 0,
        "used_lines": [],
    }
    lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]
    if not lines:
        return None, None, 0.0, dbg

    candidates: List[Tuple[int, str, bool, bool]] = []
    for i, ln in enumerate(lines):
        has_hint = SALARY_HINT_RE.search(ln) is not None
        has_pay = PAY_TOKEN_RE.search(ln) is not None
        if not has_hint and not has_pay:
            continue
        if SALARY_NOISE_RE.search(ln) and not has_hint:
            continue
        candidates.append((i, ln, has_hint, has_pay))
    if not candidates:
        return None, None, 0.0, dbg

    has_hint = any(x[2] for x in candidates)
    if not has_hint:
        # Inline pay tokens without a "salary" hint are allowed only near the
        # header/contact block.
        candidates = [x for x in candidates if x[0] < 15]
        if not candidates:
            return None, None, 0.0, dbg

    scan_chunks: List[str] = []
    for i, ln, hint, _ in candidates:
        chunk = ln
        if hint and (i + 1) < len(lines):
            chunk = f"{chunk} {lines[i + 1]}"
        scan_chunks.append(chunk)
        dbg["used_lines"].append(ln)
        if hint:
            dbg["hint_lines"] += 1
        dbg["currency_hits"] += len(CURRENCY_RE.findall(chunk))

    nums: List[int] = []
    for chunk in scan_chunks:
        for m in NUM_RE.finditer(chunk):
            val = None
            if m.group(1) and m.group(2):
                val = int(m.group(1)) * 1000
            elif m.group(3):
                val = int(re.sub(r"\s+", "", m.group(3)))
            elif m.group(4):
                val = int(m.group(4))
            if val and 20_000 <= val <= 30_000_000:
                nums.append(val)
                dbg["numbers"].append(val)
    if not nums:
        return None, None, 0.0, dbg

    nums = sorted(nums)
    salary_min = nums[0]
    salary_max = nums[-1] if len(nums) > 1 else nums[0]

    if dbg["hint_lines"] > 0:
        conf = 0.82 if dbg["currency_hits"] > 0 else 0.70
    else:
        conf = 0.58 if dbg["currency_hits"] > 0 else 0.0
    if salary_max > salary_min * 4:
        conf -= 0.12
    if len(nums) == 1:
        conf -= 0.06
    conf = max(0.0, min(conf, 0.9))
    if conf < 0.45:
        return None, None, conf, dbg
    return salary_min, salary_max, conf, dbg
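
# Salary-parsing sketch. The figures are invented; the confidence arithmetic
# follows the scoring above (hint + currency -> 0.82, single number -> -0.06)
# and depends on the reconstructed SALARY_*/NUM_RE patterns.
def _demo_salary() -> None:  # hypothetical helper
    lo, hi, conf, dbg = extract_salary("Желаемая зарплата: 250 000 руб. на руки")
    assert (lo, hi) == (250_000, 250_000)
    assert abs(conf - 0.76) < 1e-9  # 0.82 - 0.06 for a single number

    # "150к"/"200к" forms expand to thousands; a hint without currency gives 0.70.
    lo2, hi2, conf2, _ = extract_salary("Зарплата: от 150к до 200к gross")
    assert (lo2, hi2) == (150_000, 200_000)
    assert conf2 == 0.70
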
def extract_location_best_effort(text: str) -> Optional[str]:
    if not text:
        return None

    def _clean_loc(val: str) -> str:
        return re.sub(r"\s+", " ", (val or "").strip(" |,;"))

    def _is_loc_like(val: str, *, allow_single: bool = False) -> bool:
        v = _clean_loc(val)
        if not v or len(v) < 3 or len(v) > 90:
            return False
        if re.search(r"[@/\\]", v) or re.search(r"\d{3,}", v):
            return False
        if SECTION_HEADER_RE.match(v):
            return False
        if LOCATION_CITY_COUNTRY_RE.match(v):
            return True
        if allow_single and re.fullmatch(r"[A-Za-zА-Яа-я][A-Za-zА-Яа-я' .\-]{1,40}", v):
            return True
        return False

    patterns = [
        re.compile(r"(?i)\b(location|город|city)\s*:\s*(.{2,40})"),
        re.compile(r"(?i)\b(место)\s*:\s*(.{2,40})"),
        re.compile(r"(?i)\b(проживает|проживание)\s*:\s*(.{2,60})"),
    ]
    for p in patterns:
        m = p.search(text)
        if m:
            val = _clean_loc(m.group(2))
            if _is_loc_like(val, allow_single=True):
                return val

    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    head: List[str] = []
    for ln in lines[:60]:
        if SECTION_HEADER_RE.match(ln):
            low = ln.lower()
            if low in ("contacts", "contact", "contact info"):
                continue
            break
        head.append(ln)

    for ln in head:
        parts = [ln] + [seg.strip() for seg in ln.split("|") if seg.strip()]
        for seg in parts:
            if _is_loc_like(seg):
                return _clean_loc(seg)
    return None
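
# Minimal end-to-end smoke run (invented résumé; the printed values depend on
# the reconstructed patterns above, so treat this as a sketch, not a test).
if __name__ == "__main__":
    sample = "\n".join(
        [
            "Ivan Petrov",
            "Location: Moscow",
            "ivan.petrov@example.com",
            "Опыт работы — 6 лет",
            "English — B2",
            "Skills: Python, PostgreSQL, Docker",
        ]
    )
    print("name:", extract_name_guess(sample))
    print("location:", extract_location_best_effort(sample))
    print("contacts:", safe_json(extract_contacts(sample)))
    print("experience:", extract_experience_years(sample)[:3])
    print("english:", extract_english(sample))
    print("roles/skills:", extract_roles_skills(sample))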