Initial commit
This commit is contained in:
659
extract/parse.py
Normal file
659
extract/parse.py
Normal file
@@ -0,0 +1,659 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from tg_resume_db.normalize import normalize_skill
|
||||
from tg_resume_db.extract.experience import extract_experience
|
||||
|
||||
# --- CONTACT PATTERNS ---

# A complete email address (case-insensitive).
EMAIL_RE = re.compile(r"\b[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}\b", re.I)
# An email whose local part was split by whitespace during text extraction,
# e.g. "ivan petrov@mail.com" -> prefix="ivan", tail="petrov@mail.com".
EMAIL_SPLIT_RE = re.compile(
    r"(?<![@\w])(?P<prefix>[a-z0-9][a-z0-9._%+\-]{1,40})\s+"
    r"(?P<tail>[a-z0-9][a-z0-9._%+\-]{0,40}@[a-z0-9.\-]+\.[a-z]{2,})",
    re.I,
)
# Phone number: optional "+", a long run of digits with separators allowed inside.
PHONE_RE = re.compile(r"(?<!\w)(\+?\d[\d\-\s().]{7,}\d)(?!\w)")
# Telegram handle: "t.me/<name>" or "@<name>", 4-32 chars.
TG_RE = re.compile(r"(?:t\.me/|@)([a-z0-9_]{4,32})", re.I)
GITHUB_RE = re.compile(r"github\.com/([A-Za-z0-9\-]+)", re.I)
LINKEDIN_RE = re.compile(r"linkedin\.com/in/([A-Za-z0-9\-_]+)", re.I)
URL_RE = re.compile(r"\bhttps?://[^\s)]+", re.I)

# --- ENGLISH LEVEL ---

# Explicit CEFR code (A1..C2, optional "+").
EN_RE = re.compile(r"\b(A1\+?|A2\+?|B1\+?|B2\+?|C1\+?|C2\+?)\b", re.I)
# Textual level words; only trusted near an "english" mention (see extract_english).
EN_TEXT_RE = re.compile(
    r"\b(native|fluent|proficient|advanced|upper\s*intermediate|intermediate|elementary)\b",
    re.I,
)
# Marks a line as being about English ("англий" covers the Russian word stems).
EN_LANG_RE = re.compile(r"\b(english|англий)\b", re.I)

# Remote-work hints, English and Russian.
REMOTE_RE = re.compile(
    r"\b("
    r"full[\s\-]?remote|remote[\s\-]?(work|position|job|only)|open to remote|remote first|"
    r"удален\w*|удалён\w*|дистанцион\w*|home office|relocation not needed"
    r")\b",
    re.I,
)

# Salary (rough)
# Currency words/symbols (RUB/USD/EUR).
CURRENCY_RE = re.compile(r"(?:\b(?:руб|rub|usd|eur)\b|[₽$€])", re.I)
# Amount forms: "120k"/"120к" (group 1+2), "120 000" (group 3), plain 4-7 digits (group 4).
NUM_RE = re.compile(r"(?:(\d{2,3})\s*(k|к)\b)|(\d{2,3}\s*000)|(\d{4,7})", re.I)
# Words that label a line as salary-related (EN + RU).
SALARY_HINT_RE = re.compile(
    r"\b("
    r"salary|compensation|rate|expected salary|desired salary|salary expectation|income|"
    r"зарплат\w*|доход|оклад|вознагражден\w*|заработ\w*|expectations"
    r")\b",
    re.I,
)
# A number within ~14 chars of a currency marker, in either order.
PAY_TOKEN_RE = re.compile(
    r"([€$₽]|\b(?:usd|eur|rub|руб)\b).{0,14}\d|\d.{0,14}([€$₽]|\b(?:usd|eur|rub|руб)\b)",
    re.I,
)
# Count nouns that make a big number NOT a salary ("1M users", "500 clients", ...).
SALARY_NOISE_RE = re.compile(
    r"\b(users?|employees?|people|domains?|cities?|objects?|stores?|requests?|transactions?|"
    r"companies?|followers?|downloads?|clients?)\b",
    re.I,
)

# A line that consists solely of a resume section heading.
SECTION_HEADER_RE = re.compile(
    r"^\s*(contacts?|contact info|about|summary|skills?|experience|work experience|education|languages?|projects?)\s*$",
    re.I,
)
# A "City, Country" line (latin or cyrillic letters).
LOCATION_CITY_COUNTRY_RE = re.compile(
    r"^[A-Za-zА-Яа-я][A-Za-zА-Яа-я' .\-]{1,40},\s*[A-Za-zА-Яа-я][A-Za-zА-Яа-я' .\-]{1,40}$"
)
|
||||
|
||||
# --- SKILLS & ROLES ---

# Canonical skill vocabulary matched against (lowercased) resume text.
SKILLS = {
    "python","go","golang","java","kotlin","c#","c++","cpp","javascript","typescript","node","nodejs","react","vue","angular",
    "sql","postgres","postgresql","mysql","mssql","redis","kafka","rabbitmq","docker","k8s","kubernetes","helm","terraform",
    "aws","gcp","azure","linux","nginx","grpc","rest","graphql","spark","airflow","fastapi","django","flask","spring","dotnet",
    "pytest","selenium","playwright","ci/cd","gitlab","github actions","prometheus","grafana"
}

# Alternate spellings that should still count as the canonical skill key.
_SKILL_ALIASES: Dict[str, List[str]] = {
    "javascript": ["java script", "java-script", "js"],
    "typescript": ["type script", "type-script", "ts"],
    "postgresql": ["postgres", "postgre sql", "postgre-sql"],
    "graphql": ["graph ql"],
    "grpc": ["g rpc"],
}
|
||||
|
||||
|
||||
def _build_skill_patterns() -> List[Tuple[str, re.Pattern]]:
    """Compile one regex per (canonical skill, alias) pair.

    Each pattern matches the alias as a standalone token, fenced by a
    lookbehind/lookahead that rejects adjacent identifier-ish characters
    (letters, digits, '+', '#') so "go" does not fire inside "django".
    """
    compiled: List[Tuple[str, re.Pattern]] = []
    for canonical in sorted(SKILLS):
        for variant in [canonical, *_SKILL_ALIASES.get(canonical, [])]:
            if canonical == "java" and variant == "java":
                # Bare "java" must not match inside the split form "java script".
                rx = re.compile(r"(?<![a-z0-9+#])java(?!\s*script)(?![a-z0-9+#])", re.I)
            else:
                rx = re.compile(r"(?<![a-z0-9+#])" + re.escape(variant) + r"(?![a-z0-9+#])", re.I)
            compiled.append((canonical, rx))
    return compiled


_SKILL_PATTERNS = _build_skill_patterns()
|
||||
|
||||
# Canonical role labels emitted by extract_roles_skills().
ROLES = {
    "backend","frontend","fullstack","devops","qa","sre","data engineer","data scientist","ml engineer",
    "mobile","android","ios","team lead","tech lead","architect"
}

# Spelling/translation variants (EN + RU) that map onto each canonical role.
_ROLE_ALIASES: Dict[str, List[str]] = {
    "backend": ["backend", "backend developer", "backend engineer", "бэкенд", "бекенд"],
    "frontend": ["frontend", "frontend developer", "frontend engineer", "фронтенд", "фронт"],
    "fullstack": ["fullstack", "full stack", "full-stack", "фулстек"],
    "devops": ["devops", "dev ops", "platform engineer", "infrastructure engineer"],
    "qa": ["qa", "quality assurance", "tester", "test engineer", "test automation", "manual qa"],
    "sre": ["sre", "site reliability"],
    "data engineer": ["data engineer"],
    "data scientist": ["data scientist"],
    "ml engineer": ["ml engineer", "machine learning engineer"],
    "mobile": ["mobile developer", "mobile engineer"],
    "android": ["android developer", "android engineer"],
    "ios": ["ios developer", "ios engineer"],
    "team lead": ["team lead", "teamlead"],
    "tech lead": ["tech lead", "techlead"],
    "architect": ["architect", "solution architect", "software architect"],
}
|
||||
|
||||
|
||||
def _build_role_patterns() -> Dict[str, List[re.Pattern]]:
    """Compile the alias regexes for every role in ROLES.

    Aliases are matched as standalone tokens — the lookbehind/lookahead
    rejects adjacent letters, digits, '+' and '#'. Roles missing from
    _ROLE_ALIASES fall back to matching their own name.
    """
    def _compile(alias: str) -> re.Pattern:
        return re.compile(r"(?<![a-z0-9+#])" + re.escape(alias) + r"(?![a-z0-9+#])", re.I)

    return {
        role: [_compile(alias) for alias in _ROLE_ALIASES.get(role, [role])]
        for role in ROLES
    }


_ROLE_PATTERNS = _build_role_patterns()
|
||||
|
||||
# --- HR / RECRUITER FILTERS ---
# Words that indicate the line is about searching for candidates, not owning the skill.
HR_CONTEXT_RE = re.compile(
    r"\b(hiring|recruitment|recruiter|sourc(ing|er)|talent|acquisition|vacancy|vacancies|candidate|staffing|headhunt)\b|"
    r"\b(подбор|поиск|найм|закры(ла|л|тие)|ваканси|резюме|сорс(инг|ер)|рекрут|персонал|кадр(ы|ов)|hr)\b",
    re.I
)

# Roles that explicitly define the person as Non-Engineering
NON_TECH_ROLES_RE = re.compile(
    r"\b(recruiter|hr|talent|manager|generalist|human resources|head of recruitment|рекрутер|менеджер по персоналу|эйчар)\b",
    re.I
)

# --- EXPERIENCE ---

# Biographical lines (gender/birth/age, "NN years old") whose numbers must not
# be mistaken for years of experience.
AGE_LINE_RE = re.compile(
    r"(?i)\b(мужчина|женщина|родил[а-я]*|возраст|years?\s+old)\b"
)

# "Work experience" / "Опыт работы" section header.
EXP_HEADER_RE = re.compile(
    r"(?i)\b(опыт\s+работы|стаж(\s+работы)?|work\s+experience|experience)\b"
)

# "5 years 10 months"
# Experience header immediately followed by a "<y> years [<m> months]" total.
EXP_SUMMARY_RE = re.compile(
    r"(?i)\b(опыт\s+работы|стаж(\s+работы)?|work\s+experience|experience)\b"
    r"[^0-9]{0,20}"
    r"(?P<y>\d{1,2})\s*(?:год|года|лет|years?|yrs?)"
    r"(?:[^0-9]{0,20}(?P<m>\d{1,2})\s*(?:мес|месяц|месяца|месяцев|months?))?"
)

# Bare "<y> years [<m> months]" figure, used near an experience header only.
EXP_NEARBY_RE = re.compile(
    r"(?i)\b(?P<y>\d{1,2})\s*(?:год|года|лет|years?|yrs?)"
    r"(?:[^0-9]{0,20}(?P<m>\d{1,2})\s*(?:мес|месяц|месяца|месяцев|months?))?"
)

# hh.ru export footer: "<Name> • резюме обновлено ...".
HH_FOOTER_RE = re.compile(
    r"(?P<name>[A-Za-zА-ЯЁ][A-Za-zА-Яа-яЁё'\-\s]{2,80})\s*[•|]\s*резюме\s+обновлено",
    re.I,
)
# Explicit "Name: ..." / "Имя: ..." key-value line.
NAME_KV_RE = re.compile(r"^\s*(name|имя)\s*[:\-]\s*(.+)$", re.I)
# 2-4 capitalized words (latin or cyrillic) — the shape of a person's full name.
NAME_LINE_RE = re.compile(
    r"^[A-ZА-ЯЁ][A-Za-zА-Яа-яЁё'\-]+(?:\s+[A-ZА-ЯЁ][A-Za-zА-Яа-яЁё'\-]+){1,3}$"
)
# Lines that can look name-shaped but are section titles / job titles /
# institution words, never a person's name.
NAME_STOPWORDS = {
    "resume", "cv", "contacts", "contact", "summary", "skills", "experience", "education",
    "projects", "about", "profile", "objective", "навыки", "опыт", "образование",
    "контакты", "профиль", "цель", "резюме",
    "developer", "engineer", "backend", "frontend", "fullstack", "team lead", "tech lead",
    "backend developer", "frontend developer", "fullstack developer", "software engineer",
    "разработчик", "инженер", "бэкенд", "фронтенд", "фулстек", "тимлид", "техлид",
    "top skills", "experience", "education", "languages", "certifications",
    "skills & endorsements", "endorsements",
    "university", "state university", "institute", "college", "academy", "school",
    "bachelor", "master", "degree", "faculty", "университет", "институт", "академия",
    "колледж", "школа", "бакалавр", "магистр", "факультет",
}

# Heading words used by _looks_like_heading_line to reject a line outright.
_NAME_BAD_WORDS = {
    "skills", "top skills", "experience", "education", "languages", "certifications",
    "projects", "summary", "about", "profile", "endorsements",
    "university", "institute", "college", "academy", "school",
    "bachelor", "master", "degree", "faculty",
}

# Education/institution vocabulary; capitalized lines near these are treated
# as education entries, not names.
NAME_INSTITUTION_RE = re.compile(
    r"\b("
    r"university|institute|college|academy|school|faculty|bachelor|master|degree|"
    r"mathematics|computer science|informatics|physics|economics|management|"
    r"университет|институт|академ|колледж|школа|факультет|бакалав|магистр|"
    r"математик|информатик|физик|экономик|менеджмент"
    r")\b",
    re.I,
)

# Label words that may precede an email in the text but are never part of its
# local part (used when re-joining whitespace-split emails).
_EMAIL_PREFIX_STOP = {
    "email", "mail", "contact", "contacts", "phone", "tel", "telegram", "linkedin", "github",
}
|
||||
|
||||
|
||||
def _prune_fragment_emails(values: List[str]) -> List[str]:
|
||||
uniq = sorted(set(v.lower().strip() for v in values if v and "@" in v))
|
||||
out: List[str] = []
|
||||
for e in uniq:
|
||||
local, domain = e.split("@", 1)
|
||||
drop = False
|
||||
for other in uniq:
|
||||
if other == e:
|
||||
continue
|
||||
ol, od = other.split("@", 1)
|
||||
if od != domain:
|
||||
continue
|
||||
if len(local) <= 8 and len(ol) > len(local) + 2 and ol.endswith(local) and re.search(r"[._\-]", ol):
|
||||
drop = True
|
||||
break
|
||||
if not drop:
|
||||
out.append(e)
|
||||
return out
|
||||
|
||||
|
||||
def extract_experience_years(text: str) -> Tuple[Optional[float], Optional[float], float, Dict[str, Any]]:
    """
    Returns (total_years, engineering_years, confidence, debug).

    Logic:
    1. Calculate TOTAL experience: an explicit "experience: N years M months"
       summary first, then a bare figure near an experience header, then a
       timeline reconciliation pass via extract_experience().
    2. Check if the candidate is primarily a Recruiter/HR.
       - If YES: engineering_years = 0.0 (prevents recruiters from showing up as Senior Devs).
       - If NO: engineering_years = total_years (Optimistic assumption for valid devs).
    """
    dbg: Dict[str, Any] = {"method": None, "matched": None, "is_recruiter": False}

    total_years: Optional[float] = None
    confidence = 0.0

    lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]

    # 1. Detect if Recruiter: check the header (first ~15 lines) for HR titles.
    header_text = "\n".join(lines[:15])
    is_recruiter = bool(NON_TECH_ROLES_RE.search(header_text))
    dbg["is_recruiter"] = is_recruiter

    # 2. Extract Total Duration
    if lines:
        # Strategy A: explicit summary — "Опыт работы 5 лет 10 месяцев" etc.
        for i, ln in enumerate(lines[:200]):
            if AGE_LINE_RE.search(ln):
                # Age/biography lines ("35 years old") carry misleading numbers.
                continue
            if not EXP_HEADER_RE.search(ln):
                continue
            # The figure may sit on the header line or spill onto the next two.
            window = ln
            if i + 1 < len(lines):
                window += " " + lines[i + 1]
            if i + 2 < len(lines):
                window += " " + lines[i + 2]
            m = EXP_SUMMARY_RE.search(window)
            if m:
                y = int(m.group("y"))
                mm = int(m.group("m")) if m.group("m") else 0
                val = float(round(y + (mm / 12.0), 2))
                # FIX: validate BEFORE assigning. The original stored the value
                # first and range-checked after, so an out-of-range match left
                # total_years set (with zero confidence) and silently disabled
                # Strategy B. Now mirrors Strategy B's validate-then-assign.
                if 0 <= val <= 60:
                    total_years = val
                    dbg["method"] = "summary"
                    dbg["matched"] = m.group(0)
                    confidence = 0.95
                    break

        # Strategy B: any "<y> years" figure within 12 lines of an experience header.
        if total_years is None:
            safe_lines = [ln for ln in lines if not AGE_LINE_RE.search(ln)]
            for i, ln in enumerate(safe_lines):
                if not EXP_HEADER_RE.search(ln):
                    continue
                chunk = " ".join(safe_lines[i : i + 12])
                m = EXP_NEARBY_RE.search(chunk)
                if m:
                    y = int(m.group("y"))
                    mm = int(m.group("m")) if m.group("m") else 0
                    val = float(round(y + (mm / 12.0), 2))
                    if 0 <= val <= 60:
                        total_years = val
                        dbg["method"] = "header_chunk"
                        dbg["matched"] = m.group(0)
                        confidence = 0.80
                        break

    # 2.5 Timeline/range fallback-reconciliation.
    # Protects against cases where the summary parser catches one short fragment
    # while the CV has a long timeline of dated positions.
    try:
        alt = extract_experience(text or "")
    except Exception:
        # extract_experience is best-effort; a parser crash must not sink us.
        alt = None
    if alt and alt.years is not None:
        if total_years is None:
            total_years = alt.years
            confidence = max(confidence, alt.confidence)
            dbg["method"] = "timeline_fallback"
            dbg["matched"] = "date_ranges"
        elif alt.years > (total_years + 1.0):
            # Timeline claims noticeably more than the summary found.
            strong_summary = str(dbg.get("method") or "") in ("summary", "header_chunk") and confidence >= 0.78
            if strong_summary and (alt.years - float(total_years)) > 1.5:
                # A confident explicit summary beats a wildly larger timeline.
                dbg["reconcile"] = "timeline_skip_strong_summary"
            else:
                total_years = alt.years
                confidence = max(confidence, min(0.82, alt.confidence))
                dbg["method"] = "timeline_reconcile"
                dbg["matched"] = "date_ranges"

    # 3. Calculate Engineering Years
    eng_years = total_years
    if is_recruiter:
        # A recruiter's experience counts as 0 engineering years so they do
        # not rank as senior developers.
        eng_years = 0.0

    return total_years, eng_years, confidence, dbg
|
||||
|
||||
|
||||
def _norm_phone(p: str) -> str:
|
||||
digits = re.sub(r"\D+", "", p)
|
||||
if digits.startswith("8") and len(digits) == 11:
|
||||
digits = "7" + digits[1:]
|
||||
return "+" + digits if digits else ""
|
||||
|
||||
def _norm_token(s: str) -> str:
|
||||
return re.sub(r"\s+", " ", s.strip().lower())
|
||||
|
||||
def safe_json(v) -> str:
    """Serialize *v* to a JSON string, keeping non-ASCII characters verbatim."""
    encoded = json.dumps(v, ensure_ascii=False)
    return encoded
|
||||
|
||||
def extract_contacts(text: str) -> Dict[str, List[str]]:
    """Extract contact handles from resume text.

    Returns a dict of sorted, de-duplicated lists under the keys "emails",
    "phones", "telegram", "github", "linkedin" and "urls". Accepts None or
    empty text and returns empty lists in that case.
    """
    # FIX: the original guarded only the email scans with `text or ""`;
    # phones/telegram/github/linkedin/urls called finditer(text) directly
    # and raised TypeError on None. Normalize once.
    t = text or ""
    emails_set = set(m.group(0).lower() for m in EMAIL_RE.finditer(t))
    # Re-join emails that extraction split in half: "ivan petrov@x.com".
    for m in EMAIL_SPLIT_RE.finditer(t):
        prefix = m.group("prefix").strip().lower().strip(".-_")
        if not prefix or prefix in _EMAIL_PREFIX_STOP:
            continue
        if not re.search(r"[._\-\d]", prefix):
            # A bare word ("email", "contact") is a label, not a local-part fragment.
            continue
        tail = m.group("tail").lower()
        if "@" not in tail:
            continue
        local_tail, domain = tail.split("@", 1)
        local = f"{prefix}{local_tail}"
        if len(local) > 64:
            # RFC 5321 local-part length limit.
            continue
        cand = f"{local}@{domain}"
        if EMAIL_RE.fullmatch(cand):
            emails_set.add(cand)
    emails = _prune_fragment_emails(sorted(emails_set))
    phones = sorted(set(_norm_phone(m.group(1)) for m in PHONE_RE.finditer(t) if _norm_phone(m.group(1))))
    tg = sorted(set(m.group(1).lower() for m in TG_RE.finditer(t)))
    gh = sorted(set(m.group(1).lower() for m in GITHUB_RE.finditer(t)))
    li = sorted(set(m.group(1).lower() for m in LINKEDIN_RE.finditer(t)))
    urls = sorted(set(m.group(0) for m in URL_RE.finditer(t)))
    return {"emails": emails, "phones": phones, "telegram": tg, "github": gh, "linkedin": li, "urls": urls}
|
||||
|
||||
def extract_name_guess(text: str) -> Optional[str]:
    """Best-effort extraction of the candidate's name.

    Tries, in order: the hh.ru footer ("Name • резюме обновлено"), an explicit
    "Name:/Имя:" key-value line, a name-looking line in the first ~40 lines,
    and finally one near the end (pptx exports often place the name there).
    Returns None when nothing plausible is found.
    """
    # FIX: the original called text.splitlines() unguarded and crashed on
    # None, even though it already used `text or ""` for the footer search.
    t = text or ""
    lines = [ln.strip() for ln in t.splitlines() if ln.strip()]
    if not lines:
        return None

    # 1) HH footer "Name • Резюме обновлено ..."
    m = HH_FOOTER_RE.search(t)
    if m:
        cand = m.group("name").strip()
        if _looks_like_name_line(cand):
            return cand

    # 2) Key-value line: "Name: ..." / "Имя: ..."
    for ln in lines[:40]:
        m2 = NAME_KV_RE.match(ln)
        if m2:
            cand = m2.group(2).strip()
            # Keep only the first segment before separators like "| City".
            cand = re.split(r"[|,/;]", cand)[0].strip()
            if _looks_like_name_line(cand):
                return cand

    # 3) Name-like in first ~40 lines
    for ln in lines[:40]:
        if _looks_like_heading_line(ln):
            continue
        if _looks_like_name_line(ln):
            return ln

    # 4) Name-like near the end (pptx exports often put name there). Skip
    #    lines whose neighbourhood mentions an institution — education blocks
    #    are full of capitalized lines that pass the name-shape test.
    tail_start = max(0, len(lines) - 60)
    for i in range(tail_start, len(lines)):
        ln = lines[i]
        if _looks_like_heading_line(ln):
            continue
        ctx = " ".join(lines[max(0, i - 2) : min(len(lines), i + 3)]).lower()
        if NAME_INSTITUTION_RE.search(ctx):
            continue
        if _looks_like_name_line(ln):
            return ln

    return None
|
||||
|
||||
|
||||
def _looks_like_heading_line(line: str) -> bool:
    """Return True when *line* reads as a section heading rather than content."""
    normalized = (line or "").strip().lower()
    if not normalized:
        return False
    if normalized in _NAME_BAD_WORDS or normalized.startswith("top skills"):
        return True
    # Short lines that mention a section keyword count as headings too.
    section_words = ("skills", "experience", "education", "languages")
    return len(normalized.split()) <= 3 and any(w in normalized for w in section_words)
|
||||
|
||||
|
||||
def _looks_like_name_line(line: str) -> bool:
    """Return True when *line* plausibly holds a person's full name."""
    if not line or len(line) > 80:
        return False
    lowered = line.lower().strip()
    # Reject known headings, resume/CV markers and education vocabulary.
    disqualified = (
        lowered in NAME_STOPWORDS
        or _looks_like_heading_line(line)
        or re.search(r"\b(resume|cv|резюме)\b", line, re.I) is not None
        or NAME_INSTITUTION_RE.search(line) is not None
    )
    if disqualified:
        return False
    # Finally require the 2-4 capitalized-words shape.
    return NAME_LINE_RE.match(line.strip()) is not None
|
||||
|
||||
def extract_remote(text: str) -> Optional[bool]:
    """Return True when a remote-work hint appears in the first 120 lines.

    Returns None (not False) when no hint is found or text is empty — the
    absence of a mention is not evidence against remote work.
    """
    if not text:
        return None
    head = text.splitlines()[:120]
    return True if any(REMOTE_RE.search(ln) for ln in head) else None
|
||||
|
||||
def extract_english(text: str) -> Optional[str]:
    """Infer an English level as a CEFR code (e.g. "B2") from resume text.

    An explicit CEFR code anywhere in the text wins. Otherwise textual
    levels ("fluent", "intermediate", ...) are accepted only on a line
    that mentions English, or on the line right after it. Returns None
    when no level can be inferred.
    """
    body = text or ""

    # 1) CEFR levels anywhere are accepted ("B2+" collapses to "B2").
    explicit = EN_RE.search(body)
    if explicit:
        return explicit.group(1).replace("+", "").upper()

    # 2) Textual levels only when English context is present.
    stripped = [ln.strip() for ln in body.splitlines() if ln.strip()]
    chunks: List[str] = []
    for i, ln in enumerate(stripped):
        if EN_LANG_RE.search(ln):
            chunks.append(ln)
            if i + 1 < len(stripped):
                chunks.append(stripped[i + 1])
    if not chunks:
        return None

    hit = EN_TEXT_RE.search("\n".join(chunks))
    if hit is None:
        return None
    word = hit.group(1).lower()
    # "upper intermediate" may carry arbitrary inner whitespace.
    if word.startswith("upper"):
        return "B2"
    return {
        "native": "C1",
        "fluent": "C1",
        "proficient": "C1",
        "advanced": "C1",
        "intermediate": "B1",
        "elementary": "A2",
    }.get(word)
|
||||
|
||||
def extract_roles_skills(text: str) -> Tuple[List[str], List[str]]:
    """
    Extracts roles and skills, but strictly filters out HR/Recruitment context.

    Lines carrying hiring/vacancy vocabulary are dropped before matching, and
    when the first ten lines identify the person as a recruiter no tech roles
    are emitted at all (such text usually describes who they hire).
    Returns (sorted roles, sorted skills).
    """
    all_lines = text.splitlines()

    # 1. Drop lines that talk about hiring/vacancies before any matching.
    searchable = "\n".join(ln for ln in all_lines if not HR_CONTEXT_RE.search(ln)).lower()

    # 2. Skills come from the cleaned text only.
    found_skills = {normalize_skill(s) or s for s, pat in _SKILL_PATTERNS if pat.search(searchable)}

    # 3. Roles — but only for non-recruiters (header check on first 10 lines).
    roles: set = set()
    header = "\n".join(all_lines[:10]).lower()
    if not NON_TECH_ROLES_RE.search(header):
        for role in ROLES:
            if not any(p.search(searchable) for p in _ROLE_PATTERNS.get(role, [])):
                continue
            # "devops" needs explicit evidence, not just CI/CD mentions.
            if role == "devops" and not re.search(
                r"\b(devops|dev ops|sre|platform engineer|infrastructure)\b", searchable, re.I
            ):
                continue
            roles.add(role)

    return sorted(roles), sorted(found_skills)
|
||||
|
||||
def norm_pipe(tokens: List[str]) -> str:
    """Join normalized, de-duplicated tokens into a "|a|b|" lookup string.

    Tokens are lowercased/whitespace-collapsed via _norm_token; empty ones
    are dropped. Returns "|" when nothing survives, so membership checks
    can always use `"|" + needle + "|" in result`.
    """
    # Normalize each token once (the original called _norm_token twice per token).
    normed = (_norm_token(t) for t in tokens)
    uniq = sorted({t for t in normed if t})
    return "|" + "|".join(uniq) + "|" if uniq else "|"
|
||||
|
||||
def extract_salary(text: str) -> Tuple[Optional[int], Optional[int], float, Dict]:
    """Extract (salary_min, salary_max, confidence, debug) from resume text.

    A line qualifies as a salary candidate when it carries either a salary
    keyword (SALARY_HINT_RE) or a currency-adjacent number (PAY_TOKEN_RE).
    Amounts are parsed from those lines (plus the following line for keyword
    hits); the min and max of plausible values become the range. When the
    final confidence is below 0.45 the amounts are withheld (None, None).
    """
    dbg: Dict[str, Any] = {"numbers": [], "currency_hits": 0, "hint_lines": 0, "used_lines": []}
    lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]
    if not lines:
        return None, None, 0.0, dbg

    # Collect candidate lines: (index, line, has_salary_keyword, has_pay_token).
    candidates: List[Tuple[int, str, bool, bool]] = []
    for i, ln in enumerate(lines):
        has_hint = SALARY_HINT_RE.search(ln) is not None
        has_pay = PAY_TOKEN_RE.search(ln) is not None
        if not has_hint and not has_pay:
            continue
        # "1M users"-style count nouns disqualify a line unless an explicit
        # salary keyword vouches for it.
        if SALARY_NOISE_RE.search(ln) and not has_hint:
            continue
        candidates.append((i, ln, has_hint, has_pay))

    if not has_hint and not candidates:
        return None, None, 0.0, dbg
    if not candidates:
        return None, None, 0.0, dbg

    has_hint = any(x[2] for x in candidates)
    if not has_hint:
        # Inline pay without "salary" is allowed only near header/contact block.
        candidates = [x for x in candidates if x[0] < 15]
        if not candidates:
            return None, None, 0.0, dbg

    # Build the text chunks to scan; a keyword line also pulls in the next
    # line, since the amount often sits below the label.
    scan_chunks: List[str] = []
    for i, ln, hint, _ in candidates:
        chunk = ln
        if hint and (i + 1) < len(lines):
            chunk = f"{chunk} {lines[i + 1]}"
        scan_chunks.append(chunk)
        dbg["used_lines"].append(ln)
        if hint:
            dbg["hint_lines"] += 1
        dbg["currency_hits"] += len(CURRENCY_RE.findall(chunk))

    # Parse amounts; NUM_RE groups: (1,2)="120k/120к", 3="120 000", 4=plain digits.
    nums: List[int] = []
    for chunk in scan_chunks:
        for m in NUM_RE.finditer(chunk):
            val = None
            if m.group(1) and m.group(2):
                val = int(m.group(1)) * 1000
            elif m.group(3):
                val = int(re.sub(r"\s+", "", m.group(3)))
            elif m.group(4):
                val = int(m.group(4))
            # Plausibility window for a monthly/yearly figure in any currency.
            if val and 20_000 <= val <= 30_000_000:
                nums.append(val)
                dbg["numbers"].append(val)

    if not nums:
        return None, None, 0.0, dbg

    nums = sorted(nums)
    salary_min = nums[0]
    salary_max = nums[-1] if len(nums) > 1 else nums[0]

    # Confidence: keyword + currency strongest; currency-only weakest usable.
    if dbg["hint_lines"] > 0:
        conf = 0.82 if dbg["currency_hits"] > 0 else 0.70
    else:
        conf = 0.58 if dbg["currency_hits"] > 0 else 0.0

    # Penalize implausibly wide ranges and single-number matches.
    if salary_max > salary_min * 4:
        conf -= 0.12
    if len(nums) == 1:
        conf -= 0.06

    conf = max(0.0, min(conf, 0.9))
    if conf < 0.45:
        return None, None, conf, dbg
    return salary_min, salary_max, conf, dbg
|
||||
|
||||
def extract_location_best_effort(text: str) -> Optional[str]:
    """Best-effort location extraction.

    Tries explicitly labeled lines ("Location:", "Город:", "Проживает:")
    first, then scans the pre-section header block for location-looking
    fragments. Returns a cleaned string or None.
    """
    if not text:
        return None

    def _clean_loc(val: str) -> str:
        # Collapse whitespace and trim surrounding separator punctuation.
        return re.sub(r"\s+", " ", (val or "").strip(" |,;"))

    def _is_loc_like(val: str, *, allow_single: bool = False) -> bool:
        # Heuristic filter: plausible length, no URL/email/long-number
        # characters, not a section heading. "City, Country" always passes;
        # a single word passes only for labeled lines (allow_single=True).
        v = _clean_loc(val)
        if not v or len(v) < 3 or len(v) > 90:
            return False
        if re.search(r"[@/\\]", v) or re.search(r"\d{3,}", v):
            return False
        if SECTION_HEADER_RE.match(v):
            return False
        if LOCATION_CITY_COUNTRY_RE.match(v):
            return True
        if allow_single and re.fullmatch(r"[A-Za-zА-Яа-я][A-Za-zА-Яа-я' .\-]{1,40}", v):
            return True
        return False

    # 1) Explicitly labeled lines, EN + RU.
    patterns = [
        re.compile(r"(?i)\b(location|город|city)\s*:\s*(.{2,40})"),
        re.compile(r"(?i)\b(место)\s*:\s*(.{2,40})"),
        re.compile(r"(?i)\b(проживает|проживание)\s*:\s*(.{2,60})"),
    ]
    for p in patterns:
        m = p.search(text)
        if m:
            val = _clean_loc(m.group(2))
            if _is_loc_like(val, allow_single=True):
                return val

    # 2) Header block: everything before the first section heading
    #    (contact headings are skipped, not treated as the boundary).
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    head: List[str] = []
    for ln in lines[:60]:
        if SECTION_HEADER_RE.match(ln):
            low = ln.lower()
            if low in ("contacts", "contact", "contact info"):
                continue
            break
        head.append(ln)

    # Try each head line whole, then its "|"-separated fragments.
    for ln in head:
        parts = [ln] + [seg.strip() for seg in ln.split("|") if seg.strip()]
        for seg in parts:
            if _is_loc_like(seg):
                return _clean_loc(seg)
    return None
|
||||
Reference in New Issue
Block a user