Files
tg_resume_db/extract/parse.py
2026-03-11 15:27:10 +03:00

660 lines
24 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from __future__ import annotations
import json
import re
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
from tg_resume_db.normalize import normalize_skill
from tg_resume_db.extract.experience import extract_experience
# --- CONTACT / LANGUAGE / SALARY / LOCATION REGEXES ---
# Plain e-mail address (case-insensitive).
EMAIL_RE = re.compile(r"\b[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}\b", re.I)
# E-mail broken by a stray space before the @-part ("john doe@mail.com"):
# captures the orphaned prefix and the tail so they can be re-joined later.
EMAIL_SPLIT_RE = re.compile(
    r"(?<![@\w])(?P<prefix>[a-z0-9][a-z0-9._%+\-]{1,40})\s+"
    r"(?P<tail>[a-z0-9][a-z0-9._%+\-]{0,40}@[a-z0-9.\-]+\.[a-z]{2,})",
    re.I,
)
# Phone: leading digit (optional "+"), 7+ digits/separators, trailing digit.
PHONE_RE = re.compile(r"(?<!\w)(\+?\d[\d\-\s().]{7,}\d)(?!\w)")
# Telegram handle via "t.me/..." or "@handle" (4-32 chars).
TG_RE = re.compile(r"(?:t\.me/|@)([a-z0-9_]{4,32})", re.I)
GITHUB_RE = re.compile(r"github\.com/([A-Za-z0-9\-]+)", re.I)
LINKEDIN_RE = re.compile(r"linkedin\.com/in/([A-Za-z0-9\-_]+)", re.I)
URL_RE = re.compile(r"\bhttps?://[^\s)]+", re.I)
# CEFR English levels (A1..C2, optional "+"), trusted anywhere in the text.
EN_RE = re.compile(r"\b(A1\+?|A2\+?|B1\+?|B2\+?|C1\+?|C2\+?)\b", re.I)
# Verbal English levels; only trusted near an explicit English mention.
EN_TEXT_RE = re.compile(
    r"\b(native|fluent|proficient|advanced|upper\s*intermediate|intermediate|elementary)\b",
    re.I,
)
# "english" / "англий..." marks a line as English-language context.
EN_LANG_RE = re.compile(r"\b(english|англий)\b", re.I)
# Remote-work willingness markers, English + Russian variants.
REMOTE_RE = re.compile(
    r"\b("
    r"full[\s\-]?remote|remote[\s\-]?(work|position|job|only)|open to remote|remote first|"
    r"удален\w*|удалён\w*|дистанцион\w*|home office|relocation not needed"
    r")\b",
    re.I,
)
# Salary (rough)
# Currency words or symbols (RUB / USD / EUR).
CURRENCY_RE = re.compile(r"(?:\b(?:руб|rub|usd|eur)\b|[₽$€])", re.I)
# Salary-shaped numbers: "120k"/"120к", "120 000", or a bare 4-7 digit figure.
NUM_RE = re.compile(r"(?:(\d{2,3})\s*(k|к)\b)|(\d{2,3}\s*000)|(\d{4,7})", re.I)
# Words hinting the line talks about salary/compensation (EN + RU).
SALARY_HINT_RE = re.compile(
    r"\b("
    r"salary|compensation|rate|expected salary|desired salary|salary expectation|income|"
    r"зарплат\w*|доход|оклад|вознагражден\w*|заработ\w*|expectations"
    r")\b",
    re.I,
)
# Currency token within 14 chars of a digit (either order) => inline pay mention.
PAY_TOKEN_RE = re.compile(
    r"([€$₽]|\b(?:usd|eur|rub|руб)\b).{0,14}\d|\d.{0,14}([€$₽]|\b(?:usd|eur|rub|руб)\b)",
    re.I,
)
# Nouns that make a big number NOT a salary ("1M users", "500 requests", ...).
SALARY_NOISE_RE = re.compile(
    r"\b(users?|employees?|people|domains?|cities?|objects?|stores?|requests?|transactions?|"
    r"companies?|followers?|downloads?|clients?)\b",
    re.I,
)
# A line that is nothing but a resume section header ("Skills", "Experience", ...).
SECTION_HEADER_RE = re.compile(
    r"^\s*(contacts?|contact info|about|summary|skills?|experience|work experience|education|languages?|projects?)\s*$",
    re.I,
)
# "City, Country" shaped line (Latin or Cyrillic letters).
LOCATION_CITY_COUNTRY_RE = re.compile(
    r"^[A-Za-zА-Яа-я][A-Za-zА-Яа-я' .\-]{1,40},\s*[A-Za-zА-Яа-я][A-Za-zА-Яа-я' .\-]{1,40}$"
)
# --- SKILLS & ROLES ---
# Canonical technical-skill vocabulary matched against the (cleaned) resume text.
SKILLS = {
    "python","go","golang","java","kotlin","c#","c++","cpp","javascript","typescript","node","nodejs","react","vue","angular",
    "sql","postgres","postgresql","mysql","mssql","redis","kafka","rabbitmq","docker","k8s","kubernetes","helm","terraform",
    "aws","gcp","azure","linux","nginx","grpc","rest","graphql","spark","airflow","fastapi","django","flask","spring","dotnet",
    "pytest","selenium","playwright","ci/cd","gitlab","github actions","prometheus","grafana"
}
# Spelling variants that should resolve to the canonical skill on the left.
_SKILL_ALIASES: Dict[str, List[str]] = {
    "javascript": ["java script", "java-script", "js"],
    "typescript": ["type script", "type-script", "ts"],
    "postgresql": ["postgres", "postgre sql", "postgre-sql"],
    "graphql": ["graph ql"],
    "grpc": ["g rpc"],
}
def _build_skill_patterns() -> List[Tuple[str, re.Pattern]]:
    """Compile one boundary-guarded pattern per (canonical skill, alias) pair.

    The custom look-arounds treat "+", "#" as word characters so tokens like
    "c++" and "c#" don't bleed into their neighbours.
    """
    def _compile(canonical: str, alias: str) -> re.Pattern:
        if canonical == "java" and alias == "java":
            # Do not match "java" inside "java script" (that is javascript).
            return re.compile(r"(?<![a-z0-9+#])java(?!\s*script)(?![a-z0-9+#])", re.I)
        return re.compile(r"(?<![a-z0-9+#])" + re.escape(alias) + r"(?![a-z0-9+#])", re.I)

    return [
        (canonical, _compile(canonical, alias))
        for canonical in sorted(SKILLS)
        for alias in [canonical] + _SKILL_ALIASES.get(canonical, [])
    ]

# Compiled once at import time.
_SKILL_PATTERNS = _build_skill_patterns()
# Canonical engineering-role labels emitted by extract_roles_skills().
ROLES = {
    "backend","frontend","fullstack","devops","qa","sre","data engineer","data scientist","ml engineer",
    "mobile","android","ios","team lead","tech lead","architect"
}
# Surface forms (EN + RU) that map onto each canonical role.
_ROLE_ALIASES: Dict[str, List[str]] = {
    "backend": ["backend", "backend developer", "backend engineer", "бэкенд", "бекенд"],
    "frontend": ["frontend", "frontend developer", "frontend engineer", "фронтенд", "фронт"],
    "fullstack": ["fullstack", "full stack", "full-stack", "фулстек"],
    "devops": ["devops", "dev ops", "platform engineer", "infrastructure engineer"],
    "qa": ["qa", "quality assurance", "tester", "test engineer", "test automation", "manual qa"],
    "sre": ["sre", "site reliability"],
    "data engineer": ["data engineer"],
    "data scientist": ["data scientist"],
    "ml engineer": ["ml engineer", "machine learning engineer"],
    "mobile": ["mobile developer", "mobile engineer"],
    "android": ["android developer", "android engineer"],
    "ios": ["ios developer", "ios engineer"],
    "team lead": ["team lead", "teamlead"],
    "tech lead": ["tech lead", "techlead"],
    "architect": ["architect", "solution architect", "software architect"],
}
def _build_role_patterns() -> Dict[str, List[re.Pattern]]:
    """Compile the alias patterns for every role in ROLES.

    Same boundary guards as the skill patterns: "+", "#" count as word chars.
    """
    compiled: Dict[str, List[re.Pattern]] = {}
    for role in ROLES:
        compiled[role] = []
        for alias in _ROLE_ALIASES.get(role, [role]):
            pattern = re.compile(
                r"(?<![a-z0-9+#])" + re.escape(alias) + r"(?![a-z0-9+#])", re.I
            )
            compiled[role].append(pattern)
    return compiled

# Compiled once at import time.
_ROLE_PATTERNS = _build_role_patterns()
# --- HR / RECRUITER FILTERS ---
# Words that indicate the line is about searching for candidates, not owning the skill.
HR_CONTEXT_RE = re.compile(
    r"\b(hiring|recruitment|recruiter|sourc(ing|er)|talent|acquisition|vacancy|vacancies|candidate|staffing|headhunt)\b|"
    r"\b(подбор|поиск|найм|закры(ла|л|тие)|ваканси|резюме|сорс(инг|ер)|рекрут|персонал|кадр(ы|ов)|hr)\b",
    re.I
)
# Roles that explicitly define the person as Non-Engineering
NON_TECH_ROLES_RE = re.compile(
    r"\b(recruiter|hr|talent|manager|generalist|human resources|head of recruitment|рекрутер|менеджер по персоналу|эйчар)\b",
    re.I
)
# --- EXPERIENCE ---
# Lines stating age/gender/birth; must not be mistaken for experience figures.
AGE_LINE_RE = re.compile(
    r"(?i)\b(мужчина|женщина|родил[а-я]*|возраст|years?\s+old)\b"
)
# "Work experience" / "Опыт работы" section header or summary lead-in.
EXP_HEADER_RE = re.compile(
    r"(?i)\b(опыт\s+работы|стаж(\s+работы)?|work\s+experience|experience)\b"
)
# "5 years 10 months"
# Full summary: header words, then years and optional months within 20 chars.
EXP_SUMMARY_RE = re.compile(
    r"(?i)\b(опыт\s+работы|стаж(\s+работы)?|work\s+experience|experience)\b"
    r"[^0-9]{0,20}"
    r"(?P<y>\d{1,2})\s*(?:год|года|лет|years?|yrs?)"
    r"(?:[^0-9]{0,20}(?P<m>\d{1,2})\s*(?:мес|месяц|месяца|месяцев|months?))?"
)
# Looser fallback: any "N years [M months]" without requiring the header words.
EXP_NEARBY_RE = re.compile(
    r"(?i)\b(?P<y>\d{1,2})\s*(?:год|года|лет|years?|yrs?)"
    r"(?:[^0-9]{0,20}(?P<m>\d{1,2})\s*(?:мес|месяц|месяца|месяцев|months?))?"
)
# HH.ru export footer: "Name • Резюме обновлено ..." — a reliable name source.
HH_FOOTER_RE = re.compile(
    r"(?P<name>[A-Za-zА-ЯЁ][A-Za-zА-Яа-яЁё'\-\s]{2,80})\s*[•|]\s*резюме\s+обновлено",
    re.I,
)
# Explicit "Name: ..." / "Имя: ..." key-value line.
NAME_KV_RE = re.compile(r"^\s*(name|имя)\s*[:\-]\s*(.+)$", re.I)
# 2-4 capitalized words — a name-shaped line (Latin or Cyrillic).
NAME_LINE_RE = re.compile(
    r"^[A-ZА-ЯЁ][A-Za-zА-Яа-яЁё'\-]+(?:\s+[A-ZА-ЯЁ][A-Za-zА-Яа-яЁё'\-]+){1,3}$"
)
# Whole-line values that can never be a person's name (headers, job titles, ...).
NAME_STOPWORDS = {
    "resume", "cv", "contacts", "contact", "summary", "skills", "experience", "education",
    "projects", "about", "profile", "objective", "навыки", "опыт", "образование",
    "контакты", "профиль", "цель", "резюме",
    "developer", "engineer", "backend", "frontend", "fullstack", "team lead", "tech lead",
    "backend developer", "frontend developer", "fullstack developer", "software engineer",
    "разработчик", "инженер", "бэкенд", "фронтенд", "фулстек", "тимлид", "техлид",
    "top skills", "experience", "education", "languages", "certifications",
    "skills & endorsements", "endorsements",
    "university", "state university", "institute", "college", "academy", "school",
    "bachelor", "master", "degree", "faculty", "университет", "институт", "академия",
    "колледж", "школа", "бакалавр", "магистр", "факультет",
}
# Exact-match heading words used by _looks_like_heading_line().
_NAME_BAD_WORDS = {
    "skills", "top skills", "experience", "education", "languages", "certifications",
    "projects", "summary", "about", "profile", "endorsements",
    "university", "institute", "college", "academy", "school",
    "bachelor", "master", "degree", "faculty",
}
# Education/institution vocabulary — lines (or contexts) mentioning these are
# institution names, not people.
NAME_INSTITUTION_RE = re.compile(
    r"\b("
    r"university|institute|college|academy|school|faculty|bachelor|master|degree|"
    r"mathematics|computer science|informatics|physics|economics|management|"
    r"университет|институт|академ|колледж|школа|факультет|бакалав|магистр|"
    r"математик|информатик|физик|экономик|менеджмент"
    r")\b",
    re.I,
)
# Label words that precede a space-split e-mail but are not part of its local.
_EMAIL_PREFIX_STOP = {
    "email", "mail", "contact", "contacts", "phone", "tel", "telegram", "linkedin", "github",
}
def _prune_fragment_emails(values: List[str]) -> List[str]:
uniq = sorted(set(v.lower().strip() for v in values if v and "@" in v))
out: List[str] = []
for e in uniq:
local, domain = e.split("@", 1)
drop = False
for other in uniq:
if other == e:
continue
ol, od = other.split("@", 1)
if od != domain:
continue
if len(local) <= 8 and len(ol) > len(local) + 2 and ol.endswith(local) and re.search(r"[._\-]", ol):
drop = True
break
if not drop:
out.append(e)
return out
def extract_experience_years(text: str) -> Tuple[Optional[float], Optional[float], float, Dict[str, Any]]:
    """
    Returns (total_years, engineering_years, confidence, debug).
    Logic:
    1. Calculate TOTAL experience: explicit summary line first, then a
       header-adjacent scan, then reconciliation against parsed date ranges.
    2. Check if the candidate is primarily a Recruiter/HR.
       - If YES: engineering_years = 0.0 (prevents recruiters from showing up as Senior Devs).
       - If NO: engineering_years = total_years (Optimistic assumption for valid devs).
    """
    dbg: Dict[str, Any] = {"method": None, "matched": None, "is_recruiter": False}
    total_years: Optional[float] = None
    confidence = 0.0
    lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]

    # 1. Detect if Recruiter
    # Check the "Header" (first ~15 lines) for HR titles.
    header_text = "\n".join(lines[:15])
    is_recruiter = bool(NON_TECH_ROLES_RE.search(header_text))
    dbg["is_recruiter"] = is_recruiter

    # 2. Extract Total Duration
    if lines:
        # Strategy A: Explicit summary ("опыт работы 5 лет 10 месяцев").
        for i, ln in enumerate(lines[:200]):
            if AGE_LINE_RE.search(ln):
                continue  # "35 years old" is age, not experience
            if EXP_HEADER_RE.search(ln):
                # The figure may sit on the next line or two after the header.
                window = ln
                if i + 1 < len(lines):
                    window += " " + lines[i + 1]
                if i + 2 < len(lines):
                    window += " " + lines[i + 2]
                m = EXP_SUMMARY_RE.search(window)
                if m:
                    y = int(m.group("y"))
                    mm = int(m.group("m")) if m.group("m") else 0
                    val = float(round(y + (mm / 12.0), 2))
                    # BUGFIX: validate BEFORE committing. Previously the
                    # out-of-range value was stored in total_years anyway,
                    # which blocked Strategy B and could be returned with
                    # confidence 0.0.
                    if 0 <= val <= 60:
                        total_years = val
                        dbg["method"] = "summary"
                        dbg["matched"] = m.group(0)
                        confidence = 0.95
                        break
        # Strategy B: Fallback — any "N years" figure near an experience header.
        if total_years is None:
            safe_lines = [ln for ln in lines if not AGE_LINE_RE.search(ln)]
            for i, ln in enumerate(safe_lines):
                if not EXP_HEADER_RE.search(ln):
                    continue
                chunk = " ".join(safe_lines[i : i + 12])
                m = EXP_NEARBY_RE.search(chunk)
                if m:
                    y = int(m.group("y"))
                    mm = int(m.group("m")) if m.group("m") else 0
                    val = float(round(y + (mm / 12.0), 2))
                    if 0 <= val <= 60:
                        total_years = val
                        dbg["method"] = "header_chunk"
                        dbg["matched"] = m.group(0)
                        confidence = 0.80
                        break

    # 2.5 Timeline/range fallback-reconciliation
    # Protects against cases where summary parser catches one short fragment
    # while CV has a long timeline.
    try:
        alt = extract_experience(text or "")
    except Exception:
        alt = None  # best-effort: a failing timeline parse must not abort
    if alt and alt.years is not None:
        if total_years is None:
            total_years = alt.years
            confidence = max(confidence, alt.confidence)
            dbg["method"] = "timeline_fallback"
            dbg["matched"] = "date_ranges"
        elif alt.years > (total_years + 1.0):
            # Timeline claims noticeably MORE than the summary did.
            strong_summary = str(dbg.get("method") or "") in ("summary", "header_chunk") and confidence >= 0.78
            if strong_summary and (alt.years - float(total_years)) > 1.5:
                # Trust the explicit summary over a wildly larger timeline.
                dbg["reconcile"] = "timeline_skip_strong_summary"
            else:
                total_years = alt.years
                confidence = max(confidence, min(0.82, alt.confidence))
                dbg["method"] = "timeline_reconcile"
                dbg["matched"] = "date_ranges"

    # 3. Calculate Engineering Years
    eng_years = total_years
    if is_recruiter:
        # If they are a recruiter, their "engineering" experience is effectively 0
        # for the purpose of finding a Developer.
        eng_years = 0.0
    return total_years, eng_years, confidence, dbg
def _norm_phone(p: str) -> str:
digits = re.sub(r"\D+", "", p)
if digits.startswith("8") and len(digits) == 11:
digits = "7" + digits[1:]
return "+" + digits if digits else ""
def _norm_token(s: str) -> str:
return re.sub(r"\s+", " ", s.strip().lower())
def safe_json(v) -> str:
    """JSON-encode *v*, keeping non-ASCII characters readable (no \\uXXXX escapes)."""
    encoded = json.dumps(v, ensure_ascii=False)
    return encoded
def extract_contacts(text: str) -> Dict[str, List[str]]:
    """Extract e-mails, phones, telegram/github/linkedin handles and URLs.

    Returns a dict of sorted, deduplicated, lower-cased lists (URLs keep case).

    BUGFIX: the original applied ``text or ""`` only to the two e-mail scans,
    so passing ``None`` crashed with TypeError at the phone/handle regexes;
    the input is now normalized once for every scan.
    """
    t = text or ""
    emails_set = set(m.group(0).lower() for m in EMAIL_RE.finditer(t))
    # Re-join e-mails that were split by a stray space ("john doe@mail.com").
    for m in EMAIL_SPLIT_RE.finditer(t):
        prefix = m.group("prefix").strip().lower().strip(".-_")
        if not prefix or prefix in _EMAIL_PREFIX_STOP:
            continue
        # Require a separator or digit so plain words aren't glued onto locals.
        if not re.search(r"[._\-\d]", prefix):
            continue
        tail = m.group("tail").lower()
        if "@" not in tail:
            continue
        local_tail, domain = tail.split("@", 1)
        local = f"{prefix}{local_tail}"
        if len(local) > 64:  # RFC 5321 local-part length limit
            continue
        cand = f"{local}@{domain}"
        if EMAIL_RE.fullmatch(cand):
            emails_set.add(cand)
    emails = _prune_fragment_emails(sorted(emails_set))
    phones = sorted(set(_norm_phone(m.group(1)) for m in PHONE_RE.finditer(t) if _norm_phone(m.group(1))))
    tg = sorted(set(m.group(1).lower() for m in TG_RE.finditer(t)))
    gh = sorted(set(m.group(1).lower() for m in GITHUB_RE.finditer(t)))
    li = sorted(set(m.group(1).lower() for m in LINKEDIN_RE.finditer(t)))
    urls = sorted(set(m.group(0) for m in URL_RE.finditer(t)))
    return {"emails": emails, "phones": phones, "telegram": tg, "github": gh, "linkedin": li, "urls": urls}
def extract_name_guess(text: str) -> Optional[str]:
    """Best-effort candidate-name guess, trying heuristics in order:

    1) HH.ru footer "Name • Резюме обновлено ...";
    2) an explicit "Name:" / "Имя:" key-value line;
    3) a name-shaped line among the first ~40 lines;
    4) a name-shaped line near the end (pptx exports often put it there),
       skipping lines whose surroundings look like education/institution text.
    Returns None when nothing plausible is found.
    """
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    if not lines:
        return None

    footer = HH_FOOTER_RE.search(text or "")
    if footer:
        candidate = footer.group("name").strip()
        if _looks_like_name_line(candidate):
            return candidate

    for ln in lines[:40]:
        kv = NAME_KV_RE.match(ln)
        if not kv:
            continue
        # Keep only the part before any separator ("Ivan Petrov | backend").
        candidate = re.split(r"[|,/;]", kv.group(2).strip())[0].strip()
        if _looks_like_name_line(candidate):
            return candidate

    for ln in lines[:40]:
        if not _looks_like_heading_line(ln) and _looks_like_name_line(ln):
            return ln

    for idx in range(max(0, len(lines) - 60), len(lines)):
        ln = lines[idx]
        if _looks_like_heading_line(ln):
            continue
        # A +/-2-line context window filters out university/degree blocks.
        context = " ".join(lines[max(0, idx - 2) : min(len(lines), idx + 3)]).lower()
        if NAME_INSTITUTION_RE.search(context):
            continue
        if _looks_like_name_line(ln):
            return ln
    return None
def _looks_like_heading_line(line: str) -> bool:
    """True when the line is a resume section heading rather than content."""
    low = (line or "").strip().lower()
    if not low:
        return False
    if low in _NAME_BAD_WORDS or low.startswith("top skills"):
        return True
    # Short lines that mention a section word are headings too ("My skills").
    is_short = len(low.split()) <= 3
    mentions_section = any(
        marker in low for marker in ("skills", "experience", "education", "languages")
    )
    return is_short and mentions_section
def _looks_like_name_line(line: str) -> bool:
    """True when the line plausibly holds a person's name.

    Accepts 2-4 capitalized words (Latin or Cyrillic) up to 80 chars, and
    rejects stopwords, headings, resume/CV markers, and institution names.
    """
    if not line or len(line) > 80:
        return False
    if line.lower().strip() in NAME_STOPWORDS:
        return False
    if _looks_like_heading_line(line):
        return False
    if re.search(r"\b(resume|cv|резюме)\b", line, re.I):
        return False
    if NAME_INSTITUTION_RE.search(line):
        return False
    return bool(NAME_LINE_RE.match(line.strip()))
def extract_remote(text: str) -> Optional[bool]:
    """True when a remote-work marker appears in the first 120 lines, else None.

    (None means "unknown", never an explicit "no".)
    """
    if not text:
        return None
    head = text.splitlines()[:120]
    if any(REMOTE_RE.search(ln) for ln in head):
        return True
    return None
def extract_english(text: str) -> Optional[str]:
    """Infer the candidate's English level as a CEFR code, or None.

    A CEFR code (A1..C2) anywhere in the text wins outright. Verbal levels
    ("fluent", "intermediate", ...) count only on a line mentioning English,
    or on the line immediately after it, and are mapped onto CEFR codes.
    """
    t = text or ""
    lines = [ln.strip() for ln in t.splitlines() if ln.strip()]

    cefr = EN_RE.search(t)
    if cefr:
        return cefr.group(1).replace("+", "").upper()

    # Gather the English-context lines plus their immediate successors.
    chunks: List[str] = []
    for i, ln in enumerate(lines):
        if not EN_LANG_RE.search(ln):
            continue
        chunks.append(ln)
        if i + 1 < len(lines):
            chunks.append(lines[i + 1])
    if not chunks:
        return None

    verbal = EN_TEXT_RE.search("\n".join(chunks))
    if not verbal:
        return None
    word = verbal.group(1).lower()
    if word.startswith("upper"):
        return "B2"
    return {
        "native": "C1",
        "fluent": "C1",
        "proficient": "C1",
        "advanced": "C1",
        "intermediate": "B1",
        "elementary": "A2",
    }.get(word)
def extract_roles_skills(text: str) -> Tuple[List[str], List[str]]:
    """
    Extracts roles and skills, but strictly filters out HR/Recruitment context.
    """
    lines = text.splitlines()

    # Drop every line that talks about hiring/vacancies: skills listed there
    # describe who the author recruits, not what they themselves can do.
    clean_text = "\n".join(
        ln for ln in lines if not HR_CONTEXT_RE.search(ln)
    ).lower()

    # Skills are matched against the cleaned text only.
    skills = sorted({
        normalize_skill(name) or name
        for name, pattern in _SKILL_PATTERNS
        if pattern.search(clean_text)
    })

    roles: set = set()
    header_text = "\n".join(lines[:10]).lower()
    # If the header explicitly says recruiter/HR, emit NO tech roles at all —
    # any "backend"/"qa" in the body usually describes who they hire.
    if not NON_TECH_ROLES_RE.search(header_text):
        for role in ROLES:
            if not any(p.search(clean_text) for p in _ROLE_PATTERNS.get(role, [])):
                continue
            # devops needs explicit evidence, not just CI/CD mentions.
            if role == "devops" and not re.search(
                r"\b(devops|dev ops|sre|platform engineer|infrastructure)\b",
                clean_text,
                re.I,
            ):
                continue
            roles.add(role)
    return sorted(roles), skills
def norm_pipe(tokens: List[str]) -> str:
    """Encode tokens as a pipe-delimited set: "|a|b|"; bare "|" when empty.

    Tokens are normalized via _norm_token, deduplicated, and sorted, which
    makes the encoding order-insensitive and suitable for LIKE-style lookups.
    """
    normalized = {_norm_token(t) for t in tokens}
    normalized.discard("")
    if not normalized:
        return "|"
    return "|" + "|".join(sorted(normalized)) + "|"
def extract_salary(text: str) -> Tuple[Optional[int], Optional[int], float, Dict]:
    """Best-effort salary-range extraction.

    Returns (salary_min, salary_max, confidence, debug). Only lines with a
    salary hint word or an inline currency+number pair are scanned; confidence
    is built from hint/currency evidence, damped for suspicious spreads, and
    thresholded at 0.45 (below which (None, None) is returned with the score).
    """
    dbg: Dict[str, Any] = {"numbers": [], "currency_hits": 0, "hint_lines": 0, "used_lines": []}
    lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]
    if not lines:
        return None, None, 0.0, dbg
    # Candidate tuples: (line index, line, has_hint_word, has_inline_pay_token).
    candidates: List[Tuple[int, str, bool, bool]] = []
    for i, ln in enumerate(lines):
        has_hint = SALARY_HINT_RE.search(ln) is not None
        has_pay = PAY_TOKEN_RE.search(ln) is not None
        if not has_hint and not has_pay:
            continue
        # "1M users"-style lines only pass with an explicit salary hint.
        if SALARY_NOISE_RE.search(ln) and not has_hint:
            continue
        candidates.append((i, ln, has_hint, has_pay))
    if not candidates:
        return None, None, 0.0, dbg
    has_hint = any(x[2] for x in candidates)
    if not has_hint:
        # Inline pay without "salary" is allowed only near header/contact block.
        candidates = [x for x in candidates if x[0] < 15]
        if not candidates:
            return None, None, 0.0, dbg
    scan_chunks: List[str] = []
    for i, ln, hint, _ in candidates:
        chunk = ln
        # Hint lines often say "Salary:" with the figure on the next line.
        if hint and (i + 1) < len(lines):
            chunk = f"{chunk} {lines[i + 1]}"
        scan_chunks.append(chunk)
        dbg["used_lines"].append(ln)
        if hint:
            dbg["hint_lines"] += 1
        dbg["currency_hits"] += len(CURRENCY_RE.findall(chunk))
    nums: List[int] = []
    for chunk in scan_chunks:
        for m in NUM_RE.finditer(chunk):
            val = None
            if m.group(1) and m.group(2):
                # "120k" / "120к" shorthand.
                val = int(m.group(1)) * 1000
            elif m.group(3):
                # "120 000" with thousand spacing.
                val = int(re.sub(r"\s+", "", m.group(3)))
            elif m.group(4):
                # Bare 4-7 digit figure.
                val = int(m.group(4))
            # Plausibility window (rough monthly-RUB .. yearly-RUB scale).
            if val and 20_000 <= val <= 30_000_000:
                nums.append(val)
                dbg["numbers"].append(val)
    if not nums:
        return None, None, 0.0, dbg
    nums = sorted(nums)
    salary_min = nums[0]
    salary_max = nums[-1] if len(nums) > 1 else nums[0]
    # Confidence ladder: hint+currency strongest; currency-only weaker.
    if dbg["hint_lines"] > 0:
        conf = 0.82 if dbg["currency_hits"] > 0 else 0.70
    else:
        conf = 0.58 if dbg["currency_hits"] > 0 else 0.0
    if salary_max > salary_min * 4:
        conf -= 0.12  # implausibly wide spread
    if len(nums) == 1:
        conf -= 0.06  # single figure, no actual range
    conf = max(0.0, min(conf, 0.9))
    if conf < 0.45:
        return None, None, conf, dbg
    return salary_min, salary_max, conf, dbg
def extract_location_best_effort(text: str) -> Optional[str]:
    """Best-effort location guess.

    Tries explicit "Location:"/"Город:"-style key-value patterns first, then
    scans the header block (up to the first non-contact section header) for a
    "City, Country" shaped segment. Returns None when nothing plausible found.
    """
    if not text:
        return None
    def _clean_loc(val: str) -> str:
        # Collapse whitespace and strip list/separator punctuation.
        return re.sub(r"\s+", " ", (val or "").strip(" |,;"))
    def _is_loc_like(val: str, *, allow_single: bool = False) -> bool:
        # Heuristic filter: reject URLs/paths, long digit runs, and headers;
        # accept "City, Country"; single bare words only when allow_single.
        v = _clean_loc(val)
        if not v or len(v) < 3 or len(v) > 90:
            return False
        if re.search(r"[@/\\]", v) or re.search(r"\d{3,}", v):
            return False
        if SECTION_HEADER_RE.match(v):
            return False
        if LOCATION_CITY_COUNTRY_RE.match(v):
            return True
        if allow_single and re.fullmatch(r"[A-Za-zА-Яа-я][A-Za-zА-Яа-я' .\-]{1,40}", v):
            return True
        return False
    # Explicit key-value patterns (EN + RU).
    patterns = [
        re.compile(r"(?i)\b(location|город|city)\s*:\s*(.{2,40})"),
        re.compile(r"(?i)\b(место)\s*:\s*(.{2,40})"),
        re.compile(r"(?i)\b(проживает|проживание)\s*:\s*(.{2,60})"),
    ]
    for p in patterns:
        m = p.search(text)
        if m:
            val = _clean_loc(m.group(2))
            if _is_loc_like(val, allow_single=True):
                return val
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    # Collect the header block: stop at the first real section header,
    # but step over "Contacts"-style headers (location often follows them).
    head: List[str] = []
    for ln in lines[:60]:
        if SECTION_HEADER_RE.match(ln):
            low = ln.lower()
            if low in ("contacts", "contact", "contact info"):
                continue
            break
        head.append(ln)
    # Check each header line and its pipe-separated segments.
    for ln in head:
        parts = [ln] + [seg.strip() for seg in ln.split("|") if seg.strip()]
        for seg in parts:
            if _is_loc_like(seg):
                return _clean_loc(seg)
    return None