Initial commit

This commit is contained in:
2026-03-11 15:27:10 +03:00
commit 8b4b8d54d1
34 changed files with 7407 additions and 0 deletions

39
extract/clean.py Normal file
View File

@@ -0,0 +1,39 @@
from __future__ import annotations
import re
from collections import Counter
import unicodedata
# Standalone page-counter lines like "Page 3 of 10" / "стр. 2/5" (EN + RU).
RE_PAGE = re.compile(r"^\s*(page|стр(аница)?)\s*\d+\s*(/|\s+of\s+)\s*\d+\s*$", re.I)
# A line consisting solely of punctuation/underscores (separator rules etc.).
RE_ONLY_PUNCT = re.compile(r"^[\W_]+$", re.U)
RE_MULTI_SPACE = re.compile(r"[ \t]+")
RE_MULTI_NL = re.compile(r"\n{3,}")
# Soft hyphen, zero-width space/non-joiner/joiner, BOM — stripped outright.
_INVISIBLE_CHARS = ["\u00ad", "\u200b", "\u200c", "\u200d", "\ufeff"]
# Bidi embeddings/overrides, word joiner, bidi isolates, non-characters.
_BIDI_CTRL_RE = re.compile(r"[\u202a-\u202e\u2060\u2066-\u2069\ufffe\uffff]")


def normalize_text(raw: str) -> str:
    """Normalize raw extracted resume text.

    Steps: unify newlines, strip invisible/bidi/control characters,
    collapse runs of spaces/tabs, drop page counters and punctuation-only
    lines, drop short lines repeated 4+ times (running headers/footers),
    and collapse 3+ consecutive newlines into a single blank line.
    """
    text = raw.replace("\r\n", "\n").replace("\r", "\n")
    for ch in _INVISIBLE_CHARS:
        text = text.replace(ch, "")
    text = _BIDI_CTRL_RE.sub("", text)
    # remove most control/format chars but keep line breaks and tabs
    text = "".join(
        ch for ch in text
        if ch in ("\n", "\t") or not unicodedata.category(ch).startswith("C")
    )
    # Second pass drops remaining non-printables (e.g. NBSP, line/para
    # separators). BUGFIX: "\t".isprintable() is False, so the previous
    # version deleted the tabs the first pass deliberately kept, gluing
    # words together; "\t" must be whitelisted here too. (The old
    # `ch != "\x0b"` test was redundant: \x0b is not printable anyway.)
    text = "".join(ch for ch in text if ch in ("\n", "\t") or ch.isprintable())
    lines = [RE_MULTI_SPACE.sub(" ", ln).strip() for ln in text.split("\n")]
    lines = [ln for ln in lines if ln and not RE_PAGE.match(ln) and not RE_ONLY_PUNCT.match(ln)]
    # De-duplicate boilerplate: a short line repeated on many pages is a
    # running header/footer, not content.
    counts = Counter(lines)
    filtered = []
    for ln in lines:
        if counts[ln] >= 4 and len(ln) <= 90:
            continue
        filtered.append(ln)
    text = "\n".join(filtered)
    text = RE_MULTI_NL.sub("\n\n", text).strip()
    return text
def to_fts_text(clean: str) -> str:
    """Collapse all whitespace runs to single spaces for full-text indexing."""
    return " ".join(clean.split())

134
extract/doc_type.py Normal file
View File

@@ -0,0 +1,134 @@
from __future__ import annotations
import re
from dataclasses import dataclass
from typing import List, Optional
@dataclass
class DocTypeResult:
    """Outcome of resume document-type detection."""

    # Detected type label, e.g. "hh_ru", "linkedin_pdf", "one_page_ru".
    doc_type: str
    # Heuristic confidence in [0, 1].
    confidence: float
    # Names of the pattern/heuristic signals that fired.
    signals: List[str]
# Weighted signal patterns: (compiled regex, weight, signal name).
# hh.ru resume export markers (RU).
_HH_PATTERNS = [
    (re.compile(r"\bhh\.ru\b", re.I), 2.0, "hh_domain"),
    (re.compile(r"\bheadhunter\b", re.I), 2.0, "headhunter"),
    (re.compile(r"\bрезюме\s+обновлено\b", re.I), 2.5, "hh_resume_updated"),
    # BUGFIX: this pattern was r"\елаемая..." — a mangled r"\bжелаемая...".
    # Python treats the non-ASCII escape "\е" as a literal "е", which
    # silently dropped both the word-boundary anchor and the leading "ж".
    (re.compile(r"\bжелаемая\s+должность\b", re.I), 1.2, "hh_desired_role"),
    (re.compile(r"\bключевые\s+навыки\b", re.I), 1.0, "hh_key_skills"),
    (re.compile(r"\bопыт\s+работы\b", re.I), 0.8, "hh_experience"),
]
# LinkedIn PDF export markers (EN section headers + brand).
_LI_PATTERNS = [
    (re.compile(r"\blinkedin\b", re.I), 2.5, "li_brand"),
    (re.compile(r"\blinkedin\.com\b", re.I), 2.0, "li_domain"),
    (re.compile(r"\bexperience\b", re.I), 0.9, "li_experience"),
    (re.compile(r"\beducation\b", re.I), 0.9, "li_education"),
    (re.compile(r"\bskills\b", re.I), 0.9, "li_skills"),
    (re.compile(r"\babout\b", re.I), 0.6, "li_about"),
]
# PowerPoint-export markers.
_PPTX_PATTERNS = [
    (re.compile(r"\bslide\b", re.I), 1.0, "pptx_slide"),
    (re.compile(r"\bpowerpoint\b", re.I), 1.3, "pptx_powerpoint"),
    (re.compile(r"\bpptx\b", re.I), 1.3, "pptx_ext"),
    (re.compile(r"\bpresentation\b", re.I), 0.8, "pptx_presentation"),
]
def _score_patterns(text: str, patterns) -> tuple[float, List[str]]:
    """Sum the weights of every pattern that fires; collect signal names."""
    hits = [(weight, name) for rx, weight, name in patterns if rx.search(text)]
    total = sum((w for w, _ in hits), 0.0)
    fired = [n for _, n in hits]
    return total, fired
def _confidence_from_score(score: float) -> float:
    """Map an accumulated signal score onto a coarse confidence scale."""
    steps = (
        (4.0, 0.92),
        (3.0, 0.85),
        (2.0, 0.75),
        (1.2, 0.62),
    )
    for floor, conf in steps:
        if score >= floor:
            return conf
    return 0.50 if score > 0.0 else 0.30
def detect_doc_type(clean_text: str, file_ext: Optional[str] = None) -> DocTypeResult:
    """Classify a resume document by weighted keyword/shape heuristics.

    Candidates: hh.ru export, LinkedIn PDF, PPTX export, one-page resume
    (split into RU/EN by dominant script), or a scan with almost no text
    layer. Falls back to "generic"/"generic_pdf" when nothing scores.
    (Cleanup: removed the unused `head_lc` local.)
    """
    lines = [ln.strip() for ln in (clean_text or "").splitlines() if ln.strip()]
    # Branding and section headers live in the head of the document.
    head_lines = lines[:80]
    head_text = "\n".join(head_lines)
    signals: List[str] = []
    hh_score, hh_signals = _score_patterns(head_text, _HH_PATTERNS)
    li_score, li_signals = _score_patterns(head_text, _LI_PATTERNS)
    pptx_score, pptx_signals = _score_patterns(head_text, _PPTX_PATTERNS)
    if file_ext and file_ext.lower() in (".pptx", ".ppt"):
        pptx_score += 2.0
        signals.append("pptx_ext")
    signals.extend(hh_signals + li_signals + pptx_signals)
    # One-page heuristic: short resumes with dense content
    total_len = len(clean_text or "")
    one_page_score = 0.0
    if len(lines) <= 70 and total_len <= 4500:
        one_page_score = 2.2
        signals.append("one_page_short")
    elif len(lines) <= 90 and total_len <= 6500:
        one_page_score = 1.6
        signals.append("one_page_medium")
    # Scan heuristic: very low textual content
    letters = sum(ch.isalpha() for ch in clean_text or "")
    total = max(1, len(clean_text or ""))
    letter_ratio = letters / total
    scan_score = 0.0
    if total_len < 200 or letter_ratio < 0.12:
        scan_score = 3.2
        signals.append("scan_low_text")
        # Extension bonus applies only when the low-text signal already
        # fired; otherwise every PDF would be tagged as a scan and the
        # "generic_pdf" fallback below would be unreachable.
        if file_ext and file_ext.lower() in (".pdf", ".png", ".jpg", ".jpeg", ".tiff"):
            scan_score += 0.6
            signals.append("scan_file_ext")
    candidates = [
        ("hh_ru", hh_score),
        ("linkedin_pdf", li_score),
        ("pptx_export", pptx_score),
        ("one_page", one_page_score),
        ("scan_pdf", scan_score),
    ]
    doc_type, best_score = max(candidates, key=lambda x: x[1])
    if best_score <= 0.0:
        base = "generic_pdf" if (file_ext and file_ext.lower() == ".pdf") else "generic"
        return DocTypeResult(doc_type=base, confidence=0.35, signals=signals)
    confidence = _confidence_from_score(best_score)
    # If scan is detected strongly, prefer it
    if doc_type == "scan_pdf" and confidence >= 0.8:
        return DocTypeResult(doc_type="scan_pdf", confidence=confidence, signals=signals)
    # Split one-page into ru/en
    if doc_type == "one_page":
        if _looks_cyrillic(head_text):
            return DocTypeResult(doc_type="one_page_ru", confidence=confidence, signals=signals)
        return DocTypeResult(doc_type="one_page_en", confidence=confidence, signals=signals)
    return DocTypeResult(doc_type=doc_type, confidence=confidence, signals=signals)
def _looks_cyrillic(text: str) -> bool:
    """True when Cyrillic letters outnumber Latin ones and 10+ are present."""
    cyrillic_count = len(re.findall(r"[А-Яа-яЁё]", text))
    latin_count = len(re.findall(r"[A-Za-z]", text))
    return cyrillic_count >= 10 and cyrillic_count > latin_count

159
extract/experience.py Normal file
View File

@@ -0,0 +1,159 @@
from __future__ import annotations
import re
from dataclasses import dataclass
from datetime import date
from typing import Dict, List, Optional, Tuple
# Month maps (EN + RU)
# Keys are lowercase month-name prefixes; _parse_mon matches with
# startswith(), so full names ("january", "январь"), abbreviations and
# RU case-inflected forms ("января") all resolve.
MONTHS = {
    "jan": 1, "january": 1, "янв": 1, "январ": 1,
    "feb": 2, "february": 2, "фев": 2, "феврал": 2,
    "mar": 3, "march": 3, "мар": 3, "март": 3,
    "apr": 4, "april": 4, "апр": 4, "апрел": 4,
    "may": 5, "май": 5,
    "jun": 6, "june": 6, "июн": 6, "июнь": 6,
    "jul": 7, "july": 7, "июл": 7, "июль": 7,
    "aug": 8, "august": 8, "авг": 8, "август": 8,
    "sep": 9, "september": 9, "сен": 9, "сент": 9,
    "oct": 10, "october": 10, "окт": 10, "октя": 10,
    "nov": 11, "november": 11, "ноя": 11, "ноябр": 11,
    "dec": 12, "december": 12, "дек": 12, "дека": 12,
}
# "Still employed" markers (EN + RU). BUGFIX: the RU abbreviation was
# written "по н\\." which matches a literal backslash and could never fire;
# it must be "по н\." (escaped dot) to match "по н.в.".
PRESENT_RE = re.compile(r"\b(present|now|current|настоящее время|по н\.|по настоящее)\b", re.I)
# Direct "X years" patterns
DIRECT_YEARS_RE = re.compile(r"(\d+(?:[.,]\d+)?)\s*(?:\+?\s*)?(?:years?|yrs?|лет|года|год)\b", re.I)
# Dates like 03.2019, 2019, Jan 2020, янв 2020
MMYYYY_RE = re.compile(r"\b(0?[1-9]|1[0-2])[./-](\d{4})\b")
YYYY_RE = re.compile(r"\b(19\d{2}|20\d{2})\b")
MON_YYYY_RE = re.compile(r"\b([A-Za-z]{3,9}|[А-Яа-я]{3,9})\.?\s*(\d{4})\b")
# Range separators. BUGFIX: the alternation contained an empty branch
# ("—||-"), so the separator matched the empty string and virtually any
# line parsed as a range; the en dash "–" was evidently lost. Restored.
RANGE_RE = re.compile(r"(?P<a>.+?)\s*(?:—|–|-|to|по)\s*(?P<b>.+?)$", re.I)
@dataclass
class ExpResult:
    """Result of total-experience extraction."""

    # Estimated total experience in years; None when nothing parseable.
    years: Optional[float]
    # Heuristic confidence in [0, 1].
    confidence: float
    # Trace of matches/ranges/intervals used, for debugging.
    debug: Dict
def _clamp_years(y: float) -> Optional[float]:
    """Return *y* when in the plausible 0-45 year range, else None."""
    return y if 0.0 <= y <= 45.0 else None
def _parse_mon(mon: str) -> Optional[int]:
    """Resolve a month name/abbreviation (EN/RU) to 1..12, else None."""
    token = re.sub(r"[^\wа-я]+", "", mon.strip().lower(), flags=re.I)
    # Prefix match so "январ"/"january"/"jan" all resolve via MONTHS keys.
    for prefix, number in MONTHS.items():
        if token.startswith(prefix):
            return number
    return None
def _as_ymd(y: int, m: int) -> date:
    """Build a date pinned to the first day of the given month."""
    return date(year=y, month=m, day=1)
def _parse_one_date(s: str) -> Optional[date]:
    """Parse one side of a date range to a month-resolution date.

    Tries, in order: "present" markers (mapped to the current month),
    MM.YYYY, "Mon YYYY" (EN/RU month names), then a bare YYYY (mapped to
    January). Returns None when nothing matches.
    """
    s = s.strip()
    if PRESENT_RE.search(s):
        # Ongoing employment: use the current month as the endpoint.
        today = date.today()
        return date(today.year, today.month, 1)
    m1 = MMYYYY_RE.search(s)
    if m1:
        mm = int(m1.group(1))
        yy = int(m1.group(2))
        return _as_ymd(yy, mm)
    m2 = MON_YYYY_RE.search(s)
    if m2:
        mon = _parse_mon(m2.group(1))
        yy = int(m2.group(2))
        if mon:
            return _as_ymd(yy, mon)
    m3 = YYYY_RE.search(s)
    if m3:
        yy = int(m3.group(1))
        # Bare year: assume January.
        return _as_ymd(yy, 1)
    return None
def _merge_intervals(intervals: List[Tuple[date, date]]) -> List[Tuple[date, date]]:
    """Merge overlapping/touching (start, end) intervals; result is sorted."""
    if not intervals:
        return []
    ordered = sorted(intervals, key=lambda iv: (iv[0], iv[1]))
    merged: List[Tuple[date, date]] = [ordered[0]]
    for start, end in ordered[1:]:
        last_start, last_end = merged[-1]
        if start <= last_end:
            # Overlap (or touch): extend the interval in place.
            merged[-1] = (last_start, max(last_end, end))
        else:
            merged.append((start, end))
    return merged
def _months_between(a: date, b: date) -> int:
    """Month-resolution difference b - a (callers ensure b >= a)."""
    year_months = (b.year - a.year) * 12
    return year_months + (b.month - a.month)
def extract_experience(text: str) -> ExpResult:
    """Estimate total years of experience from resume text.

    Strategy:
      1) Trust an explicit "N years"/"N лет" statement (highest found).
      2) Otherwise collect per-line date ranges, merge overlapping
         intervals, and sum month spans.
    """
    debug: Dict = {"direct_matches": [], "ranges": [], "intervals": []}
    # 1) Direct years
    directs = []
    for m in DIRECT_YEARS_RE.finditer(text):
        try:
            v = float(m.group(1).replace(",", "."))
            if 0 <= v <= 45:
                directs.append(v)
                debug["direct_matches"].append({"match": m.group(0), "value": v})
        except Exception:
            pass
    if directs:
        years = _clamp_years(max(directs))
        return ExpResult(years=years, confidence=0.90, debug=debug)
    # 2) Ranges in lines: try to detect "start - end"
    intervals: List[Tuple[date, date]] = []
    # BUGFIX: the quick separator pre-filter contained empty strings
    # ('"" in ln' is always True), which disabled it entirely; the em/en
    # dashes were evidently lost in transit. Restored.
    separators = ("—", "–", "-", " to ", " по ")
    for line in text.splitlines():
        ln = line.strip()
        if len(ln) < 7:
            continue
        # require range separator
        if not any(sep in ln for sep in separators):
            continue
        rr = RANGE_RE.match(ln)
        if not rr:
            continue
        da = _parse_one_date(rr.group("a"))
        db = _parse_one_date(rr.group("b"))
        if da and db:
            if db < da:
                da, db = db, da
            # cap extremely old
            if da.year < 1990:
                continue
            intervals.append((da, db))
            debug["ranges"].append({"line": ln, "start": da.isoformat(), "end": db.isoformat()})
    intervals = _merge_intervals(intervals)
    debug["intervals"] = [{"start": s.isoformat(), "end": e.isoformat()} for s, e in intervals]
    if not intervals:
        return ExpResult(years=None, confidence=0.0, debug=debug)
    total_months = sum(max(0, _months_between(s, e)) for s, e in intervals)
    years = _clamp_years(round(total_months / 12.0, 2))
    # confidence depends on amount of evidence
    conf = 0.70 if total_months >= 12 else 0.55
    return ExpResult(years=years, confidence=conf, debug=debug)

View File

@@ -0,0 +1,144 @@
from __future__ import annotations
import re
from dataclasses import dataclass, asdict
from datetime import date
from typing import List, Optional
# Lowercase month-name prefixes (EN + RU); _parse_mon matches with
# startswith(), so full names, abbreviations and RU inflections resolve.
MONTHS = {
    "jan": 1, "january": 1, "янв": 1, "январ": 1,
    "feb": 2, "february": 2, "фев": 2, "феврал": 2,
    "mar": 3, "march": 3, "мар": 3, "март": 3,
    "apr": 4, "april": 4, "апр": 4, "апрел": 4,
    "may": 5, "май": 5,
    "jun": 6, "june": 6, "июн": 6, "июнь": 6,
    "jul": 7, "july": 7, "июл": 7, "июль": 7,
    "aug": 8, "august": 8, "авг": 8, "август": 8,
    "sep": 9, "september": 9, "сен": 9, "сент": 9,
    "oct": 10, "october": 10, "окт": 10, "октя": 10,
    "nov": 11, "november": 11, "ноя": 11, "ноябр": 11,
    "dec": 12, "december": 12, "дек": 12, "дека": 12,
}
# "Still employed" markers (EN + RU). BUGFIX: "по н\\." matched a literal
# backslash and could never fire; it must be "по н\." to match "по н.в.".
PRESENT_RE = re.compile(r"\b(present|now|current|настоящее время|по н\.|по настоящее)\b", re.I)
MMYYYY_RE = re.compile(r"\b(0?[1-9]|1[0-2])[./-](\d{4})\b")
YYYY_RE = re.compile(r"\b(19\d{2}|20\d{2})\b")
MON_YYYY_RE = re.compile(r"\b([A-Za-z]{3,9}|[А-Яа-я]{3,9})\.?\s*(\d{4})\b")
# Range separators. BUGFIX: both alternations contained an empty branch
# ("—||-") that matched the empty string, so virtually any line counted
# as a range; the en dash "–" was evidently lost. Restored.
RANGE_RE = re.compile(r"(?P<a>.+?)\s*(?:—|–|-|to|по)\s*(?P<b>.+?)$", re.I)
YEAR_RANGE_ONLY_RE = re.compile(r"^\s*\d{4}\s*(?:—|–|-|to|по)\s*\d{4}\s*$", re.I)
# Context words marking education entries (EN + RU), used to skip
# study-period ranges when extracting positions.
EDU_CONTEXT_RE = re.compile(
    r"\b("
    r"education|university|institute|college|academy|school|bachelor|master|degree|faculty|"
    r"образование|университет|институт|академ|колледж|школа|бакалав|магистр|факультет"
    r")\b",
    re.I,
)
@dataclass
class Position:
    """One work-history entry extracted from resume text."""

    # Job title parsed from the header line after the date range; may be None.
    title: Optional[str]
    # Employer name (second header segment); may be None.
    company: Optional[str]
    # ISO dates (first of month) for the employment range; None when unparsed.
    date_from: Optional[str]
    date_to: Optional[str]
    # True when the range ends with a "present"-style marker.
    is_current: Optional[bool]
    # Free-text lines following the header up to the next range line.
    description: Optional[str]
def _parse_mon(mon: str) -> Optional[int]:
    """Map a month name/abbreviation (EN/RU) onto 1..12; None if unknown."""
    cleaned = re.sub(r"[^\wа-я]+", "", mon.strip().lower(), flags=re.I)
    # MONTHS keys are prefixes, so inflected RU forms also resolve.
    for key, value in MONTHS.items():
        if cleaned.startswith(key):
            return value
    return None
def _as_ymd(y: int, m: int) -> date:
    """Month-resolution date: first day of month m in year y."""
    return date(y, m, 1)
def _parse_one_date(s: str) -> Optional[date]:
    """Parse one side of a date range to a month-resolution date.

    Order: "present" markers (current month), MM.YYYY, "Mon YYYY"
    (EN/RU names), then bare YYYY (January). None when nothing matches.
    """
    s = s.strip()
    if PRESENT_RE.search(s):
        # Ongoing employment: endpoint is the current month.
        today = date.today()
        return date(today.year, today.month, 1)
    m1 = MMYYYY_RE.search(s)
    if m1:
        mm = int(m1.group(1))
        yy = int(m1.group(2))
        return _as_ymd(yy, mm)
    m2 = MON_YYYY_RE.search(s)
    if m2:
        mon = _parse_mon(m2.group(1))
        yy = int(m2.group(2))
        if mon:
            return _as_ymd(yy, mon)
    m3 = YYYY_RE.search(s)
    if m3:
        yy = int(m3.group(1))
        # Bare year: assume January.
        return _as_ymd(yy, 1)
    return None
def extract_positions(text: str, max_items: int = 40) -> List[Position]:
    """Scan resume lines for date-range headers and build Position entries.

    A position starts at a line matching RANGE_RE; the following line is
    treated as a "Title, Company" header; subsequent lines up to the next
    range line become the description. Education-looking ranges are
    skipped. At most *max_items* positions are returned.
    """
    lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]
    positions: List[Position] = []
    i = 0
    while i < len(lines) and len(positions) < max_items:
        ln = lines[i]
        # Cheap separator pre-filter before running the regex.
        # NOTE(review): this tuple contains empty strings, so '"" in ln'
        # is always True and the pre-filter is a no-op — the em/en dashes
        # look lost in transit; RANGE_RE below still gates. TODO confirm.
        if not any(x in ln for x in ("", "", "-", " to ", " по ")):
            i += 1
            continue
        rr = RANGE_RE.match(ln)
        if not rr:
            i += 1
            continue
        # Look at surrounding lines to decide whether this range is an
        # education entry rather than employment.
        ctx = " ".join(lines[max(0, i - 2): min(len(lines), i + 4)])
        if YEAR_RANGE_ONLY_RE.match(ln) and EDU_CONTEXT_RE.search(ctx):
            i += 1
            continue
        da = _parse_one_date(rr.group("a"))
        db = _parse_one_date(rr.group("b"))
        if not da or not db:
            i += 1
            continue
        # Ignore implausibly old start dates.
        if da.year < 1990:
            i += 1
            continue
        is_current = PRESENT_RE.search(rr.group("b")) is not None
        title = None
        company = None
        desc_lines: List[str] = []
        if i + 1 < len(lines):
            if EDU_CONTEXT_RE.search(lines[i + 1]):
                i += 1
                continue
            # Header convention: "Title, Company" (also "|" or "/" separated).
            header = lines[i + 1]
            parts = [p.strip() for p in re.split(r"[,|/]", header) if p.strip()]
            if parts:
                title = parts[0]
            if len(parts) > 1:
                company = parts[1]
        # Collect description lines until the next date-range line.
        j = i + 2
        while j < len(lines):
            if any(x in lines[j] for x in ("", "", "-", " to ", " по ")) and RANGE_RE.match(lines[j]):
                break
            desc_lines.append(lines[j])
            j += 1
        positions.append(
            Position(
                title=title,
                company=company,
                date_from=da.isoformat(),
                date_to=db.isoformat(),
                is_current=is_current,
                description="\n".join(desc_lines).strip() if desc_lines else None,
            )
        )
        # Jump the cursor past the consumed description.
        i = j
    return positions
def positions_to_dicts(items: "List[Position]") -> List[dict]:
    """Serialize Position dataclasses into plain JSON-ready dicts."""
    out: List[dict] = []
    for item in items:
        out.append(asdict(item))
    return out

585
extract/llm.py Normal file
View File

@@ -0,0 +1,585 @@
from __future__ import annotations
import hashlib
import json
import os
import re
import sqlite3
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
try:
import httpx # type: ignore
except Exception: # pragma: no cover
httpx = None # type: ignore
def resolve_llm_runtime() -> Dict[str, str]:
    """
    Resolve OpenAI-compatible runtime config.
    Supports both generic vars and Mistral aliases:
    - generic: LLM_BASE_URL / LLM_MODEL / LLM_API_KEY
    - mistral: MISTRAL_API_KEY / MISTRAL_MODEL / MISTRAL_BASE_URL
    """
    def _env(name: str, default: str = "") -> str:
        # Empty values fall back to the default, matching `or` semantics.
        return (os.environ.get(name) or default).strip()

    provider = _env("LLM_PROVIDER").lower()
    base_url = _env("LLM_BASE_URL")
    model = _env("LLM_MODEL")
    api_key = _env("LLM_API_KEY")
    mistral_key = _env("MISTRAL_API_KEY")
    mistral_model = _env("MISTRAL_MODEL")
    mistral_base = _env("MISTRAL_BASE_URL", "https://api.mistral.ai/v1")
    # Mistral aliases fill in whatever the generic vars left blank.
    api_key = api_key or mistral_key
    model = model or mistral_model
    mistral_hinted = bool(
        mistral_key or mistral_model or provider == "mistral" or os.environ.get("MISTRAL_BASE_URL")
    )
    if not base_url and mistral_hinted:
        base_url = mistral_base
    if base_url:
        base_url = base_url.rstrip("/")
    if not provider:
        looks_mistral = "mistral.ai" in base_url or (model and model.lower().startswith("mistral"))
        provider = "mistral" if looks_mistral else "generic"
    return {
        "provider": provider,
        "base_url": base_url,
        "model": model,
        "api_key": api_key,
    }
# ------------- Public API -------------
def llm_parse_enabled() -> bool:
    """
    Enabled only if httpx is available and both base_url/model are resolved.
    Opt-out via LLM_PARSE_ENABLED=0.
    """
    if httpx is None:
        return False
    flag = os.environ.get("LLM_PARSE_ENABLED", "1").lower()
    if flag in ("0", "false", "no"):
        return False
    runtime = resolve_llm_runtime()
    return bool(runtime["base_url"] and runtime["model"])
# Cache-busting prompt versions: baked into cache keys so that bumping a
# version after editing a template invalidates stale cached completions.
_PROMPT_VERSION = "v3_sections_doc_type"
_REVIEW_PROMPT_VERSION = "v1_review_merge"
@dataclass
class LLMExtraction:
    """Structured resume fields returned by the LLM extraction prompt."""

    roles: List[str]
    skills: List[str]
    primary_languages: List[str]
    seniority: Optional[str]
    backend_focus: Optional[bool]
    experience_years_total: Optional[float]
    experience_years_engineering: Optional[float]
    english_level: Optional[str]
    location: Optional[str]
    remote_ok: Optional[bool]
    salary_min_usd: Optional[int]
    salary_max_usd: Optional[int]
    salary_min_rub: Optional[int]
    salary_max_rub: Optional[int]
    highlights: List[str]
    keywords: List[str]

    @staticmethod
    def from_obj(obj: Dict[str, Any]) -> "LLMExtraction":
        """Build an LLMExtraction from an untrusted JSON-ish dict,
        coercing each field defensively (bad values become None/[])."""
        def _as_list(v: Any) -> List[str]:
            # Accept a list or a scalar; drop empty/whitespace-only items.
            if v is None:
                return []
            if isinstance(v, list):
                return [str(x).strip() for x in v if str(x).strip()]
            s = str(v).strip()
            return [s] if s else []
        def _as_float(v: Any) -> Optional[float]:
            try:
                return float(v)
            except Exception:
                return None
        def _as_int(v: Any) -> Optional[int]:
            # Via float() so "12.0" and 12.9 coerce instead of raising.
            try:
                return int(float(v))
            except Exception:
                return None
        def _as_bool(v: Any) -> Optional[bool]:
            # Accept real booleans plus common textual/numeric spellings.
            if isinstance(v, bool):
                return v
            if v is None:
                return None
            s = str(v).strip().lower()
            if s in ("true", "1", "yes", "y"):
                return True
            if s in ("false", "0", "no", "n"):
                return False
            return None
        return LLMExtraction(
            roles=_as_list(obj.get("roles")),
            skills=_as_list(obj.get("skills")),
            primary_languages=_as_list(obj.get("primary_languages")),
            seniority=(str(obj.get("seniority")).strip().lower() or None) if obj.get("seniority") else None,
            backend_focus=_as_bool(obj.get("backend_focus")),
            experience_years_total=_as_float(obj.get("experience_years_total")),
            experience_years_engineering=_as_float(obj.get("experience_years_engineering")),
            english_level=(str(obj.get("english_level")).strip().upper() or None) if obj.get("english_level") else None,
            location=(str(obj.get("location")).strip() or None) if obj.get("location") else None,
            remote_ok=_as_bool(obj.get("remote_ok")),
            salary_min_usd=_as_int(obj.get("salary_min_usd")),
            salary_max_usd=_as_int(obj.get("salary_max_usd")),
            salary_min_rub=_as_int(obj.get("salary_min_rub")),
            salary_max_rub=_as_int(obj.get("salary_max_rub")),
            highlights=_as_list(obj.get("highlights")),
            keywords=_as_list(obj.get("keywords")),
        )
def llm_extract_profile(
    clean_text: str,
    *,
    con: Optional[sqlite3.Connection] = None,
    doc_type: Optional[str] = None,
    sections: Optional[Dict[str, str]] = None,
) -> Tuple[Optional[LLMExtraction], Dict[str, Any]]:
    """
    Returns (LLMExtraction | None, debug_info).
    - Uses cache on disk/sqlite to keep throughput high.
    - Silently degrades to None on any failure.
    """
    runtime = resolve_llm_runtime()
    dbg: Dict[str, Any] = {
        "enabled": llm_parse_enabled(),
        "provider": runtime.get("provider"),
        "model": runtime.get("model"),
        "from_cache": False,
        "cache_backend": None,
        "error": None,
        "prompt_version": _PROMPT_VERSION,
    }
    if not llm_parse_enabled():
        return None, dbg
    # Cache key covers text content, model and prompt version, so any of
    # them changing invalidates the cached completion.
    text_hash = hashlib.sha1(clean_text.encode("utf-8", errors="ignore")).hexdigest()
    cache_key = f"extract:{text_hash}:{runtime['model']}:{_PROMPT_VERSION}"
    payload = _build_payload(
        clean_text,
        doc_type=doc_type,
        sections=sections,
        prompt_version=_PROMPT_VERSION,
        temperature=float(os.environ.get("LLM_PARSE_TEMPERATURE", 0.1)),
        max_tokens=int(os.environ.get("LLM_PARSE_MAX_TOKENS", 700)),
        system_prompt="You output ONLY JSON for structured resume extraction.",
        prompt_template=_PROMPT_TEMPLATE,
    )
    data = _cached_llm_json_call(
        con=con,
        cache_key=cache_key,
        model=runtime["model"],
        payload=payload,
        dbg=dbg,
    )
    if data is None:
        return None, dbg
    return LLMExtraction.from_obj(data), dbg
def llm_review_profile(
    clean_text: str,
    *,
    draft: Dict[str, Any],
    con: Optional[sqlite3.Connection] = None,
    doc_type: Optional[str] = None,
    sections: Optional[Dict[str, str]] = None,
) -> Tuple[Optional[LLMExtraction], Dict[str, Any]]:
    """
    Second-pass validator:
    - Takes already parsed JSON (draft)
    - Re-checks every field against resume text
    - Returns corrected extraction for safe merge in pipeline
    """
    runtime = resolve_llm_runtime()
    dbg: Dict[str, Any] = {
        "enabled": llm_parse_enabled(),
        "provider": runtime.get("provider"),
        "model": runtime.get("model"),
        "from_cache": False,
        "cache_backend": None,
        "error": None,
        "prompt_version": _REVIEW_PROMPT_VERSION,
        "quality_score": None,
        "changed_fields": [],
        "issues_found": [],
    }
    if not llm_parse_enabled():
        return None, dbg
    # Canonicalize the draft (whitelisted keys, sorted) so the cache key
    # is stable regardless of the caller's dict ordering.
    clean_draft = _sanitize_review_draft(draft)
    draft_blob = json.dumps(clean_draft, ensure_ascii=False, sort_keys=True)
    text_hash = hashlib.sha1(clean_text.encode("utf-8", errors="ignore")).hexdigest()
    draft_hash = hashlib.sha1(draft_blob.encode("utf-8", errors="ignore")).hexdigest()
    cache_key = f"review:{text_hash}:{draft_hash}:{runtime['model']}:{_REVIEW_PROMPT_VERSION}"
    payload = _build_payload(
        clean_text,
        doc_type=doc_type,
        sections=sections,
        prompt_version=_REVIEW_PROMPT_VERSION,
        temperature=float(os.environ.get("LLM_REVIEW_TEMPERATURE", 0.0)),
        max_tokens=int(os.environ.get("LLM_REVIEW_MAX_TOKENS", 850)),
        system_prompt="You output ONLY JSON for resume parsing quality review.",
        prompt_template=_REVIEW_PROMPT_TEMPLATE,
        extra_vars={"draft_json": draft_blob},
    )
    data = _cached_llm_json_call(
        con=con,
        cache_key=cache_key,
        model=runtime["model"],
        payload=payload,
        dbg=dbg,
    )
    if data is None:
        return None, dbg
    # The model should wrap the result in "corrected"; tolerate a flat
    # object as a fallback.
    corrected_obj: Dict[str, Any]
    if isinstance(data.get("corrected"), dict):
        corrected_obj = data["corrected"]
    else:
        corrected_obj = data
    dbg["quality_score"] = _as_float(data.get("quality_score"))
    dbg["changed_fields"] = _as_str_list(data.get("changed_fields"))
    dbg["issues_found"] = _as_str_list(data.get("issues_found"))
    return LLMExtraction.from_obj(corrected_obj), dbg
# ------------- Internal helpers -------------
# First-pass extraction prompt (RU). Placeholders filled by _build_payload:
# {doc_type}, {sections_block}, {resume_text}. Doubled braces ({{ }})
# survive str.format() as literal JSON braces.
_PROMPT_TEMPLATE = """
Ты - ассистент, который структурирует резюме разработчиков. Отвечай ТОЛЬКО JSON.
Используй только факты из текста, ничего не придумывай. Если данных нет - ставь null или пустой список.
Схема:
{{
"roles": ["backend","devops","frontend","qa","data engineer","android","ios"],
"skills": ["python","go","k8s","postgres","react", "..."],
"primary_languages": ["python","go","java","c++", "..."],
"seniority": "intern|junior|middle|senior|lead|principal|null",
"backend_focus": true|false|null,
"experience_years_total": number|null,
"experience_years_engineering": number|null,
"english_level": "A1|A2|B1|B2|C1|C2|null",
"location": "city, country|null",
"remote_ok": true|false|null,
"salary_min_usd": int|null,
"salary_max_usd": int|null,
"salary_min_rub": int|null,
"salary_max_rub": int|null,
"highlights": ["кратко достижения (1-2 предложения)"],
"keywords": ["уникальные ключевые слова, продукты или домены"]
}}
Не включай контактные данные в skills/keywords.
Detected doc_type: {doc_type}
Sections (if present):
{sections_block}
Full text snippet (use only if needed):
```TEXT
{resume_text}
```
"""
# Second-pass review/merge prompt (RU); additionally takes {draft_json}
# with the first-pass result to re-check against the resume text.
_REVIEW_PROMPT_TEMPLATE = """
Ты валидатор качества парсинга резюме разработчиков. Отвечай ТОЛЬКО JSON.
У тебя есть черновой JSON после эвристик/первичного парсинга. Нужно перепроверить каждое поле по тексту резюме.
Исправляй только то, что прямо подтверждается текстом. Нельзя выдумывать.
Верни JSON строго такой формы:
{{
"corrected": {{
"roles": ["..."],
"skills": ["..."],
"primary_languages": ["..."],
"seniority": "intern|junior|middle|senior|lead|principal|null",
"backend_focus": true|false|null,
"experience_years_total": number|null,
"experience_years_engineering": number|null,
"english_level": "A1|A2|B1|B2|C1|C2|null",
"location": "city, country|null",
"remote_ok": true|false|null,
"salary_min_usd": int|null,
"salary_max_usd": int|null,
"salary_min_rub": int|null,
"salary_max_rub": int|null,
"highlights": ["..."],
"keywords": ["..."]
}},
"changed_fields": ["field_name", "..."],
"issues_found": ["кратко что было неверно/сомнительно", "..."],
"quality_score": 0.0
}}
Черновик JSON:
```DRAFT
{draft_json}
```
Detected doc_type: {doc_type}
Sections (if present):
{sections_block}
Full text snippet (use only if needed):
```TEXT
{resume_text}
```
"""
def _trim_text(text: str, max_len: int = 9000) -> str:
    """
    Keep head and tail to preserve summary + recent projects.
    """
    if len(text) <= max_len:
        return text
    head = text[: max_len // 2]
    tail = text[-max_len // 2 :]
    return f"{head}\n...\n{tail}"
def _build_payload(
    clean_text: str,
    *,
    doc_type: Optional[str],
    sections: Optional[Dict[str, str]],
    prompt_version: str,
    temperature: float,
    max_tokens: int,
    system_prompt: str,
    prompt_template: str,
    extra_vars: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Assemble the request task consumed by _cached_llm_json_call /
    _llm_call_json: base_url, chat-completions payload, headers, timeout."""
    runtime = resolve_llm_runtime()
    base_url = runtime["base_url"]
    model = runtime["model"]
    sections_block = _build_sections_block(sections)
    # Template variables; extra_vars (e.g. draft_json) may extend them.
    tpl_vars = {
        "resume_text": _trim_text(clean_text),
        "doc_type": (doc_type or "unknown"),
        "sections_block": sections_block or "(no sections detected)",
    }
    if extra_vars:
        tpl_vars.update(extra_vars)
    prompt = prompt_template.format(**tpl_vars)
    return {
        "base_url": base_url,
        "model": model,
        "prompt_version": prompt_version,
        "payload": {
            "model": model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
            ],
            "temperature": temperature,
            "max_tokens": max_tokens,
        },
        "headers": _build_headers(runtime),
        "timeout": float(os.environ.get("LLM_PARSE_TIMEOUT", 18.0)),
    }
def _build_headers(runtime: Dict[str, str]) -> Dict[str, str]:
    """HTTP headers for the completions call; bearer auth when a key is set."""
    headers: Dict[str, str] = {"Content-Type": "application/json"}
    key = runtime.get("api_key", "")
    if key:
        headers["Authorization"] = f"Bearer {key}"
    return headers
def _cached_llm_json_call(
    *,
    con: Optional[sqlite3.Connection],
    cache_key: str,
    model: str,
    payload: Dict[str, Any],
    dbg: Dict[str, Any],
) -> Optional[Dict[str, Any]]:
    """Call the LLM with a two-level cache (sqlite first, then disk).

    Returns the parsed JSON dict or None; call failures are recorded in
    dbg["error"] instead of raising.
    """
    # 1) sqlite cache (shared with the rest of the pipeline).
    data = _cache_get_sqlite(con, cache_key)
    if data:
        dbg["from_cache"] = True
        dbg["cache_backend"] = "sqlite"
        return data
    # 2) disk cache; degrade gracefully when the directory is unwritable.
    cache_dir = Path(os.environ.get("LLM_PARSE_CACHE", ".cache/llm_parse")).resolve()
    cache_ok = True
    try:
        cache_dir.mkdir(parents=True, exist_ok=True)
    except Exception:
        cache_ok = False
    safe_name = cache_key.replace(":", "_")
    cache_path = (cache_dir / f"{safe_name}.json") if cache_ok else None
    if cache_path and cache_path.exists():
        try:
            data = json.loads(cache_path.read_text(encoding="utf-8"))
            dbg["from_cache"] = True
            dbg["cache_backend"] = "disk"
            return data
        except Exception:
            # Corrupt cache file: fall through to a fresh call.
            pass
    # 3) real call; populate both caches on success.
    try:
        data = _llm_call_json(payload)
        if con:
            _cache_put_sqlite(con, cache_key, model, data)
        if cache_path:
            cache_path.write_text(json.dumps(data, ensure_ascii=False), encoding="utf-8")
        return data
    except Exception as e:  # pragma: no cover - network/LLM failures
        dbg["error"] = repr(e)
        return None
def _llm_call_json(task: Dict[str, Any]) -> Dict[str, Any]:
    """POST an OpenAI-style chat completion and parse JSON from the reply.

    Raises on transport errors, non-2xx responses, or a reply that
    contains no JSON object; callers are expected to catch.
    """
    if httpx is None:
        raise RuntimeError("httpx is not installed")
    base_url: str = task["base_url"]
    payload: Dict[str, Any] = task["payload"]
    timeout = float(task.get("timeout", 18.0))
    with httpx.Client(timeout=timeout) as client:
        r = client.post(f"{base_url}/chat/completions", headers=task["headers"], json=payload)
        r.raise_for_status()
        data = r.json()
    content = data["choices"][0]["message"]["content"]
    # Some providers return content as a list of typed blocks.
    if isinstance(content, list):
        parts = []
        for block in content:
            if isinstance(block, dict):
                parts.append(str(block.get("text") or ""))
            else:
                parts.append(str(block))
        content = "\n".join(parts)
    content = str(content)
    # Grab the outermost {...} so surrounding prose/markdown is tolerated.
    m = re.search(r"\{.*\}", content, flags=re.S)
    if not m:
        raise ValueError("LLM did not return JSON")
    return json.loads(m.group(0))
def _build_sections_block(sections: Optional[Dict[str, str]]) -> str:
    """Render detected resume sections as labelled snippets for the prompt."""
    if not sections:
        return ""
    order = [
        ("about", "ABOUT"),
        ("skills", "SKILLS"),
        ("experience", "EXPERIENCE"),
        ("education", "EDUCATION"),
        ("contacts", "CONTACTS"),
    ]
    rendered: List[str] = []
    for key, label in order:
        body = sections.get(key)
        if not body:
            continue
        rendered.append(f"[{label}]\n{_trim_text(body, max_len=1800)}")
    return "\n\n".join(rendered)
def _sanitize_review_draft(draft: Dict[str, Any]) -> Dict[str, Any]:
    """Whitelist draft keys and round-trip through LLMExtraction so the
    review prompt only ever sees well-typed, known schema fields."""
    if not isinstance(draft, dict):
        draft = {}
    # Only fields of the extraction schema may reach the prompt.
    allowed = {
        "roles",
        "skills",
        "primary_languages",
        "seniority",
        "backend_focus",
        "experience_years_total",
        "experience_years_engineering",
        "english_level",
        "location",
        "remote_ok",
        "salary_min_usd",
        "salary_max_usd",
        "salary_min_rub",
        "salary_max_rub",
        "highlights",
        "keywords",
    }
    cleaned = {k: v for k, v in draft.items() if k in allowed}
    # from_obj coerces values defensively; asdict yields a plain dict.
    return asdict(LLMExtraction.from_obj(cleaned))
def _as_float(v: Any) -> Optional[float]:
    """Coerce to a float clamped into [0, 1]; negatives/garbage give None."""
    try:
        value = float(v)
    except Exception:
        return None
    if value < 0:
        return None
    return min(value, 1.0)
def _as_str_list(v: Any) -> List[str]:
    """Normalize a value into a list of non-empty stripped strings."""
    if v is None:
        return []
    if not isinstance(v, list):
        single = str(v).strip()
        return [single] if single else []
    return [str(item).strip() for item in v if str(item).strip()]
def _cache_get_sqlite(con: Optional[sqlite3.Connection], cache_key: str) -> Optional[Dict[str, Any]]:
    """Best-effort lookup of a cached LLM result; never raises.

    NOTE(review): row["result_json"] requires con.row_factory to be
    sqlite3.Row; with a plain connection the TypeError is swallowed and
    None is returned — confirm the pipeline sets the row factory.
    """
    if con is None:
        return None
    try:
        row = con.execute("SELECT result_json FROM llm_cache WHERE cache_key=?", (cache_key,)).fetchone()
        if row and row["result_json"]:
            return json.loads(row["result_json"])
    except Exception:
        return None
    return None
def _cache_put_sqlite(
    con: Optional[sqlite3.Connection],
    cache_key: str,
    model: str,
    data: Dict[str, Any],
) -> None:
    """Best-effort upsert of an LLM result into the sqlite cache.

    Serialization and SQL errors are swallowed deliberately — a failed
    cache write must never break the extraction pipeline.
    """
    if con is None:
        return
    try:
        con.execute(
            "INSERT OR REPLACE INTO llm_cache(cache_key, model, result_json) VALUES (?,?,?)",
            (cache_key, model, json.dumps(data, ensure_ascii=False)),
        )
    except Exception:
        return

659
extract/parse.py Normal file
View File

@@ -0,0 +1,659 @@
from __future__ import annotations
import json
import re
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
from tg_resume_db.normalize import normalize_skill
from tg_resume_db.extract.experience import extract_experience
EMAIL_RE = re.compile(r"\b[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}\b", re.I)
# Re-joins an email whose local part was split by a stray space during PDF
# text extraction ("john doe@mail.com" -> prefix + tail).
EMAIL_SPLIT_RE = re.compile(
    r"(?<![@\w])(?P<prefix>[a-z0-9][a-z0-9._%+\-]{1,40})\s+"
    r"(?P<tail>[a-z0-9][a-z0-9._%+\-]{0,40}@[a-z0-9.\-]+\.[a-z]{2,})",
    re.I,
)
PHONE_RE = re.compile(r"(?<!\w)(\+?\d[\d\-\s().]{7,}\d)(?!\w)")
# Telegram handle via t.me/ link or a bare @username.
TG_RE = re.compile(r"(?:t\.me/|@)([a-z0-9_]{4,32})", re.I)
GITHUB_RE = re.compile(r"github\.com/([A-Za-z0-9\-]+)", re.I)
LINKEDIN_RE = re.compile(r"linkedin\.com/in/([A-Za-z0-9\-_]+)", re.I)
URL_RE = re.compile(r"\bhttps?://[^\s)]+", re.I)
# CEFR English levels like "B2", "C1+".
EN_RE = re.compile(r"\b(A1\+?|A2\+?|B1\+?|B2\+?|C1\+?|C2\+?)\b", re.I)
# Verbal English-level descriptions.
EN_TEXT_RE = re.compile(
    r"\b(native|fluent|proficient|advanced|upper\s*intermediate|intermediate|elementary)\b",
    re.I,
)
EN_LANG_RE = re.compile(r"\b(english|англий)\b", re.I)
# Remote-work willingness markers (EN + RU).
REMOTE_RE = re.compile(
    r"\b("
    r"full[\s\-]?remote|remote[\s\-]?(work|position|job|only)|open to remote|remote first|"
    r"удален\w*|удалён\w*|дистанцион\w*|home office|relocation not needed"
    r")\b",
    re.I,
)
# Salary (rough)
CURRENCY_RE = re.compile(r"(?:\b(?:руб|rub|usd|eur)\b|[₽$€])", re.I)
# Amounts: "120k"/"120к", "120 000", or a plain 4-7 digit number.
NUM_RE = re.compile(r"(?:(\d{2,3})\s*(k|к)\b)|(\d{2,3}\s*000)|(\d{4,7})", re.I)
# Words that mark a line as salary-related (EN + RU).
SALARY_HINT_RE = re.compile(
    r"\b("
    r"salary|compensation|rate|expected salary|desired salary|salary expectation|income|"
    r"зарплат\w*|доход|оклад|вознагражден\w*|заработ\w*|expectations"
    r")\b",
    re.I,
)
# A digit within 14 chars of a currency token (either order).
PAY_TOKEN_RE = re.compile(
    r"([€$₽]|\b(?:usd|eur|rub|руб)\b).{0,14}\d|\d.{0,14}([€$₽]|\b(?:usd|eur|rub|руб)\b)",
    re.I,
)
# Numbers near these words are achievement metrics, not salaries.
SALARY_NOISE_RE = re.compile(
    r"\b(users?|employees?|people|domains?|cities?|objects?|stores?|requests?|transactions?|"
    r"companies?|followers?|downloads?|clients?)\b",
    re.I,
)
# A line that is exactly a resume section header.
SECTION_HEADER_RE = re.compile(
    r"^\s*(contacts?|contact info|about|summary|skills?|experience|work experience|education|languages?|projects?)\s*$",
    re.I,
)
# One-line "City, Country" location (Latin or Cyrillic).
LOCATION_CITY_COUNTRY_RE = re.compile(
    r"^[A-Za-zА-Яа-я][A-Za-zА-Яа-я' .\-]{1,40},\s*[A-Za-zА-Яа-я][A-Za-zА-Яа-я' .\-]{1,40}$"
)
# --- SKILLS & ROLES ---
# Canonical skill vocabulary; matched case-insensitively with boundary
# guards (see _build_skill_patterns).
SKILLS = {
    "python","go","golang","java","kotlin","c#","c++","cpp","javascript","typescript","node","nodejs","react","vue","angular",
    "sql","postgres","postgresql","mysql","mssql","redis","kafka","rabbitmq","docker","k8s","kubernetes","helm","terraform",
    "aws","gcp","azure","linux","nginx","grpc","rest","graphql","spark","airflow","fastapi","django","flask","spring","dotnet",
    "pytest","selenium","playwright","ci/cd","gitlab","github actions","prometheus","grafana"
}
# Alternate spellings that normalize to a canonical skill above.
_SKILL_ALIASES: Dict[str, List[str]] = {
    "javascript": ["java script", "java-script", "js"],
    "typescript": ["type script", "type-script", "ts"],
    "postgresql": ["postgres", "postgre sql", "postgre-sql"],
    "graphql": ["graph ql"],
    "grpc": ["g rpc"],
}
def _build_skill_patterns() -> List[Tuple[str, re.Pattern]]:
    """Compile one boundary-safe, case-insensitive pattern per skill alias.

    Each entry is (canonical_skill, compiled_pattern); lookarounds stop the
    pattern from firing inside larger identifiers ("golang" vs "go").
    """
    left, right = r"(?<![a-z0-9+#])", r"(?![a-z0-9+#])"
    compiled: List[Tuple[str, re.Pattern]] = []
    for skill in sorted(SKILLS):
        for alias in [skill, *_SKILL_ALIASES.get(skill, [])]:
            if skill == "java" and alias == "java":
                # Do not match "java" inside "java script" (a javascript alias).
                rx = re.compile(left + r"java(?!\s*script)" + right, re.I)
            else:
                rx = re.compile(left + re.escape(alias) + right, re.I)
            compiled.append((skill, rx))
    return compiled
# Compiled once at import time; reused by extract_roles_skills.
_SKILL_PATTERNS = _build_skill_patterns()
# Canonical role vocabulary.
ROLES = {
    "backend","frontend","fullstack","devops","qa","sre","data engineer","data scientist","ml engineer",
    "mobile","android","ios","team lead","tech lead","architect"
}
# Alternate titles (English and Russian) folded into a canonical ROLES key.
_ROLE_ALIASES: Dict[str, List[str]] = {
    "backend": ["backend", "backend developer", "backend engineer", "бэкенд", "бекенд"],
    "frontend": ["frontend", "frontend developer", "frontend engineer", "фронтенд", "фронт"],
    "fullstack": ["fullstack", "full stack", "full-stack", "фулстек"],
    "devops": ["devops", "dev ops", "platform engineer", "infrastructure engineer"],
    "qa": ["qa", "quality assurance", "tester", "test engineer", "test automation", "manual qa"],
    "sre": ["sre", "site reliability"],
    "data engineer": ["data engineer"],
    "data scientist": ["data scientist"],
    "ml engineer": ["ml engineer", "machine learning engineer"],
    "mobile": ["mobile developer", "mobile engineer"],
    "android": ["android developer", "android engineer"],
    "ios": ["ios developer", "ios engineer"],
    "team lead": ["team lead", "teamlead"],
    "tech lead": ["tech lead", "techlead"],
    "architect": ["architect", "solution architect", "software architect"],
}
def _build_role_patterns() -> Dict[str, List[re.Pattern]]:
    """Compile a boundary-safe, case-insensitive pattern for every role alias."""
    def _compile(alias: str) -> re.Pattern:
        return re.compile(r"(?<![a-z0-9+#])" + re.escape(alias) + r"(?![a-z0-9+#])", re.I)

    return {
        role: [_compile(alias) for alias in _ROLE_ALIASES.get(role, [role])]
        for role in ROLES
    }
# Compiled once at import time; reused by extract_roles_skills.
_ROLE_PATTERNS = _build_role_patterns()
# --- HR / RECRUITER FILTERS ---
# Words that indicate the line is about searching for candidates, not owning the skill.
HR_CONTEXT_RE = re.compile(
    r"\b(hiring|recruitment|recruiter|sourc(ing|er)|talent|acquisition|vacancy|vacancies|candidate|staffing|headhunt)\b|"
    r"\b(подбор|поиск|найм|закры(ла|л|тие)|ваканси|резюме|сорс(инг|ер)|рекрут|персонал|кадр(ы|ов)|hr)\b",
    re.I
)
# Roles that explicitly define the person as non-engineering (HR/recruiting titles).
NON_TECH_ROLES_RE = re.compile(
    r"\b(recruiter|hr|talent|manager|generalist|human resources|head of recruitment|рекрутер|менеджер по персоналу|эйчар)\b",
    re.I
)
# --- EXPERIENCE ---
# Biographical lines ("35 years old", gender, birth date) — excluded so an age
# is never mistaken for years of experience.
AGE_LINE_RE = re.compile(
    r"(?i)\b(мужчина|женщина|родил[а-я]*|возраст|years?\s+old)\b"
)
# "Work experience" / "Опыт работы" header marker.
EXP_HEADER_RE = re.compile(
    r"(?i)\b(опыт\s+работы|стаж(\s+работы)?|work\s+experience|experience)\b"
)
# "5 years 10 months" directly attached to an experience header.
EXP_SUMMARY_RE = re.compile(
    r"(?i)\b(опыт\s+работы|стаж(\s+работы)?|work\s+experience|experience)\b"
    r"[^0-9]{0,20}"
    r"(?P<y>\d{1,2})\s*(?:год|года|лет|years?|yrs?)"
    r"(?:[^0-9]{0,20}(?P<m>\d{1,2})\s*(?:мес|месяц|месяца|месяцев|months?))?"
)
# Looser "N years [M months]" anywhere near an experience header.
EXP_NEARBY_RE = re.compile(
    r"(?i)\b(?P<y>\d{1,2})\s*(?:год|года|лет|years?|yrs?)"
    r"(?:[^0-9]{0,20}(?P<m>\d{1,2})\s*(?:мес|месяц|месяца|месяцев|months?))?"
)
# HH.ru export footer: "Name • резюме обновлено ..." — reliable name source.
HH_FOOTER_RE = re.compile(
    r"(?P<name>[A-Za-zА-ЯЁ][A-Za-zА-Яа-яЁё'\-\s]{2,80})\s*[•|]\s*резюме\s+обновлено",
    re.I,
)
# Explicit "Name: ..." / "Имя: ..." key-value line.
NAME_KV_RE = re.compile(r"^\s*(name|имя)\s*[:\-]\s*(.+)$", re.I)
# 2-4 capitalized words — a bare "First Last [Middle]" line.
NAME_LINE_RE = re.compile(
    r"^[A-ZА-ЯЁ][A-Za-zА-Яа-яЁё'\-]+(?:\s+[A-ZА-ЯЁ][A-Za-zА-Яа-яЁё'\-]+){1,3}$"
)
# Lines that look name-shaped but are section titles / job titles / institutions.
NAME_STOPWORDS = {
    "resume", "cv", "contacts", "contact", "summary", "skills", "experience", "education",
    "projects", "about", "profile", "objective", "навыки", "опыт", "образование",
    "контакты", "профиль", "цель", "резюме",
    "developer", "engineer", "backend", "frontend", "fullstack", "team lead", "tech lead",
    "backend developer", "frontend developer", "fullstack developer", "software engineer",
    "разработчик", "инженер", "бэкенд", "фронтенд", "фулстек", "тимлид", "техлид",
    "top skills", "experience", "education", "languages", "certifications",
    "skills & endorsements", "endorsements",
    "university", "state university", "institute", "college", "academy", "school",
    "bachelor", "master", "degree", "faculty", "университет", "институт", "академия",
    "колледж", "школа", "бакалавр", "магистр", "факультет",
}
# Exact-match section words used by _looks_like_heading_line.
_NAME_BAD_WORDS = {
    "skills", "top skills", "experience", "education", "languages", "certifications",
    "projects", "summary", "about", "profile", "endorsements",
    "university", "institute", "college", "academy", "school",
    "bachelor", "master", "degree", "faculty",
}
# Education/institution vocabulary — lines (or their context) matching this are
# not person names (e.g. "Lomonosov Moscow State University").
NAME_INSTITUTION_RE = re.compile(
    r"\b("
    r"university|institute|college|academy|school|faculty|bachelor|master|degree|"
    r"mathematics|computer science|informatics|physics|economics|management|"
    r"университет|институт|академ|колледж|школа|факультет|бакалав|магистр|"
    r"математик|информатик|физик|экономик|менеджмент"
    r")\b",
    re.I,
)
# Label words that must not be glued onto a split e-mail local part.
_EMAIL_PREFIX_STOP = {
    "email", "mail", "contact", "contacts", "phone", "tel", "telegram", "linkedin", "github",
}
def _prune_fragment_emails(values: List[str]) -> List[str]:
uniq = sorted(set(v.lower().strip() for v in values if v and "@" in v))
out: List[str] = []
for e in uniq:
local, domain = e.split("@", 1)
drop = False
for other in uniq:
if other == e:
continue
ol, od = other.split("@", 1)
if od != domain:
continue
if len(local) <= 8 and len(ol) > len(local) + 2 and ol.endswith(local) and re.search(r"[._\-]", ol):
drop = True
break
if not drop:
out.append(e)
return out
def extract_experience_years(text: str) -> Tuple[Optional[float], Optional[float], float, Dict[str, Any]]:
    """
    Returns (total_years, engineering_years, confidence, debug).

    Logic:
      1. Calculate TOTAL experience: explicit summary ("опыт работы 5 лет 10 месяцев"),
         then a looser number near an experience header, then date-range timelines.
      2. Check if the candidate is primarily a Recruiter/HR (header lines).
         - If YES: engineering_years = 0.0 (prevents recruiters from showing up as Senior Devs).
         - If NO: engineering_years = total_years (optimistic assumption for valid devs).
    """
    dbg: Dict[str, Any] = {"method": None, "matched": None, "is_recruiter": False}
    total_years: Optional[float] = None
    confidence = 0.0
    lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]

    # 1. Detect if Recruiter
    # Check the "header" (first ~15 lines) for HR titles.
    header_text = "\n".join(lines[:15])
    is_recruiter = bool(NON_TECH_ROLES_RE.search(header_text))
    dbg["is_recruiter"] = is_recruiter

    # 2. Extract Total Duration
    if lines:
        # Strategy A: explicit summary on/right after an experience header.
        for i, ln in enumerate(lines[:200]):
            if AGE_LINE_RE.search(ln):
                continue  # "35 years old" must not be read as experience
            if not EXP_HEADER_RE.search(ln):
                continue
            window = ln
            if i + 1 < len(lines):
                window += " " + lines[i + 1]
            if i + 2 < len(lines):
                window += " " + lines[i + 2]
            m = EXP_SUMMARY_RE.search(window)
            if m:
                y = int(m.group("y"))
                mm = int(m.group("m")) if m.group("m") else 0
                val = float(round(y + (mm / 12.0), 2))
                # BUGFIX: validate BEFORE assigning. Previously an implausible
                # match (> 60 years) was stored in total_years with zero
                # confidence, which also blocked Strategy B and the timeline
                # fallback below.
                if 0 <= val <= 60:
                    total_years = val
                    dbg["method"] = "summary"
                    dbg["matched"] = m.group(0)
                    confidence = 0.95
                    break
        # Strategy B: any nearby "N years [M months]" in the chunk after a header.
        if total_years is None:
            safe_lines = [ln for ln in lines if not AGE_LINE_RE.search(ln)]
            for i, ln in enumerate(safe_lines):
                if not EXP_HEADER_RE.search(ln):
                    continue
                chunk = " ".join(safe_lines[i : i + 12])
                m = EXP_NEARBY_RE.search(chunk)
                if m:
                    y = int(m.group("y"))
                    mm = int(m.group("m")) if m.group("m") else 0
                    val = float(round(y + (mm / 12.0), 2))
                    if 0 <= val <= 60:
                        total_years = val
                        dbg["method"] = "header_chunk"
                        dbg["matched"] = m.group(0)
                        confidence = 0.80
                        break

    # 2.5 Timeline/range fallback-reconciliation
    # Protects against cases where the summary parser catches one short fragment
    # while the CV has a long date-range timeline.
    try:
        alt = extract_experience(text or "")
    except Exception:
        # extract_experience is best-effort; any failure just disables the fallback.
        alt = None
    if alt and alt.years is not None:
        if total_years is None:
            total_years = alt.years
            confidence = max(confidence, alt.confidence)
            dbg["method"] = "timeline_fallback"
            dbg["matched"] = "date_ranges"
        elif alt.years > (total_years + 1.0):
            strong_summary = str(dbg.get("method") or "") in ("summary", "header_chunk") and confidence >= 0.78
            if strong_summary and (alt.years - float(total_years)) > 1.5:
                # Trust a confident explicit summary over a noticeably longer timeline.
                dbg["reconcile"] = "timeline_skip_strong_summary"
            else:
                total_years = alt.years
                confidence = max(confidence, min(0.82, alt.confidence))
                dbg["method"] = "timeline_reconcile"
                dbg["matched"] = "date_ranges"

    # 3. Calculate Engineering Years
    eng_years = total_years
    if is_recruiter:
        # If they are a recruiter, their "engineering" experience is effectively 0
        # for the purpose of finding a Developer.
        eng_years = 0.0
    return total_years, eng_years, confidence, dbg
def _norm_phone(p: str) -> str:
digits = re.sub(r"\D+", "", p)
if digits.startswith("8") and len(digits) == 11:
digits = "7" + digits[1:]
return "+" + digits if digits else ""
def _norm_token(s: str) -> str:
return re.sub(r"\s+", " ", s.strip().lower())
def safe_json(v) -> str:
    """JSON-encode *v*, keeping non-ASCII characters (e.g. Cyrillic) readable."""
    encoded: str = json.dumps(v, ensure_ascii=False)
    return encoded
def extract_contacts(text: str) -> Dict[str, List[str]]:
    """Extract contact channels from resume text.

    Returns a dict with keys "emails", "phones", "telegram", "github",
    "linkedin", "urls"; every list is deduplicated and sorted.  Besides plain
    EMAIL_RE hits, addresses that extraction split as "<prefix> <tail>@<domain>"
    are reassembled via EMAIL_SPLIT_RE, then truncated fragments are pruned.
    """
    emails_set = set(m.group(0).lower() for m in EMAIL_RE.finditer(text or ""))
    for m in EMAIL_SPLIT_RE.finditer(text or ""):
        # Candidate prefix that got separated from the local part (line wrap).
        prefix = m.group("prefix").strip().lower().strip(".-_")
        if not prefix or prefix in _EMAIL_PREFIX_STOP:
            continue
        # Require a separator or digit so ordinary words are not glued on.
        if not re.search(r"[._\-\d]", prefix):
            continue
        tail = m.group("tail").lower()
        if "@" not in tail:
            continue
        local_tail, domain = tail.split("@", 1)
        local = f"{prefix}{local_tail}"
        if len(local) > 64:
            # 64 chars is the conventional local-part length cap.
            continue
        cand = f"{local}@{domain}"
        if EMAIL_RE.fullmatch(cand):
            emails_set.add(cand)
    emails = _prune_fragment_emails(sorted(emails_set))
    # Phones are normalized to "+<digits>"; empty normalizations are dropped.
    phones = sorted(set(_norm_phone(m.group(1)) for m in PHONE_RE.finditer(text) if _norm_phone(m.group(1))))
    tg = sorted(set(m.group(1).lower() for m in TG_RE.finditer(text)))
    gh = sorted(set(m.group(1).lower() for m in GITHUB_RE.finditer(text)))
    li = sorted(set(m.group(1).lower() for m in LINKEDIN_RE.finditer(text)))
    urls = sorted(set(m.group(0) for m in URL_RE.finditer(text)))
    return {"emails": emails, "phones": phones, "telegram": tg, "github": gh, "linkedin": li, "urls": urls}
def extract_name_guess(text: str) -> Optional[str]:
    """Best-effort candidate-name detection.

    Tries, in order of reliability: the HH.ru footer, an explicit "Name:" line,
    a name-shaped line near the top, then one near the end.  Returns None when
    nothing name-like is found.
    """
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    if not lines:
        return None
    # 1) HH footer "Name • Резюме обновлено ..."
    m = HH_FOOTER_RE.search(text or "")
    if m:
        cand = m.group("name").strip()
        if _looks_like_name_line(cand):
            return cand
    # 2) Key-value line: "Name: ..." / "Имя: ..."
    for ln in lines[:40]:
        m2 = NAME_KV_RE.match(ln)
        if m2:
            cand = m2.group(2).strip()
            # Drop trailing role/contact info after a separator.
            cand = re.split(r"[|,/;]", cand)[0].strip()
            if _looks_like_name_line(cand):
                return cand
    # 3) Name-like in first ~40 lines
    for ln in lines[:40]:
        if _looks_like_heading_line(ln):
            continue
        if _looks_like_name_line(ln):
            return ln
    # 4) Name-like near the end (pptx exports often put name there)
    tail_start = max(0, len(lines) - 60)
    for i in range(tail_start, len(lines)):
        ln = lines[i]
        if _looks_like_heading_line(ln):
            continue
        # Skip names embedded in education context (university / faculty lines).
        ctx = " ".join(lines[max(0, i - 2) : min(len(lines), i + 3)]).lower()
        if NAME_INSTITUTION_RE.search(ctx):
            continue
        if _looks_like_name_line(ln):
            return ln
    return None
def _looks_like_heading_line(line: str) -> bool:
    """True when *line* reads like a resume section heading rather than a name."""
    lowered = (line or "").strip().lower()
    if not lowered:
        return False
    if lowered in _NAME_BAD_WORDS or lowered.startswith("top skills"):
        return True
    is_short = len(lowered.split()) <= 3
    mentions_section = any(
        marker in lowered
        for marker in ("skills", "experience", "education", "languages")
    )
    return is_short and mentions_section
def _looks_like_name_line(line: str) -> bool:
    """Heuristic: does *line* look like a bare person name ("First Last")?"""
    if not line or len(line) > 80:
        return False
    lowered = line.lower().strip()
    if lowered in NAME_STOPWORDS:
        return False
    if _looks_like_heading_line(line):
        return False
    # "Resume"/"CV" titles and institution lines are never names.
    if re.search(r"\b(resume|cv|резюме)\b", line, re.I):
        return False
    if NAME_INSTITUTION_RE.search(line):
        return False
    return bool(NAME_LINE_RE.match(line.strip()))
def extract_remote(text: str) -> Optional[bool]:
    """True when a remote-work marker appears in the first 120 lines.

    Returns None (unknown) otherwise — absence of a marker is not evidence
    against remote work, so False is never returned.
    """
    if not text:
        return None
    head = text.splitlines()[:120]
    return True if any(REMOTE_RE.search(line) for line in head) else None
def extract_english(text: str) -> Optional[str]:
    """Map English-language mentions to a rough CEFR level, or None.

    Codes matched by EN_RE (defined earlier in this module — presumably
    CEFR-style A1..C2 tokens; verify against its definition) are trusted
    anywhere in the text.  Verbal levels ("fluent", "intermediate", ...)
    count only on, or right after, a line that mentions English, so levels
    of other languages are not picked up.
    """
    t = text or ""
    lines = [ln.strip() for ln in t.splitlines() if ln.strip()]
    # 1) CEFR levels anywhere are accepted.
    m = EN_RE.search(t)
    if m:
        return m.group(1).replace("+", "").upper()  # e.g. "b2+" -> "B2"
    # 2) Textual levels only when English context is present.
    candidate_chunks: List[str] = []
    for i, ln in enumerate(lines):
        if EN_LANG_RE.search(ln):
            candidate_chunks.append(ln)
            # The level often sits on the following line ("English" / "Fluent").
            if i + 1 < len(lines):
                candidate_chunks.append(lines[i + 1])
    if not candidate_chunks:
        return None
    m2 = EN_TEXT_RE.search("\n".join(candidate_chunks))
    if not m2:
        return None
    # Rough verbal -> CEFR mapping.
    word = m2.group(1).lower()
    if word in ("native", "fluent", "proficient", "advanced"):
        return "C1"
    if word.startswith("upper"):
        return "B2"
    if word == "intermediate":
        return "B1"
    if word == "elementary":
        return "A2"
    return None
def extract_roles_skills(text: str) -> Tuple[List[str], List[str]]:
    """
    Extracts roles and skills, but strictly filters out HR/Recruitment context.

    Returns (sorted roles, sorted skills).  Lines that talk about hiring or
    vacancies are removed first, so a recruiter's "looking for Python devs"
    does not credit them with Python.
    """
    lines = text.splitlines()
    # 1. Filter text: Remove lines that talk about hiring/vacancies
    clean_lines = []
    for ln in lines:
        if not HR_CONTEXT_RE.search(ln):
            clean_lines.append(ln)
    clean_text = "\n".join(clean_lines).lower()
    # 2. Extract Skills from clean text only
    skills = []
    for s, pat in _SKILL_PATTERNS:
        if pat.search(clean_text):
            skills.append(normalize_skill(s) or s)
    skills = sorted(set(skills))
    # 3. Extract Roles
    # Priority: Header (first 10 lines)
    header_text = "\n".join(lines[:10]).lower()
    found_roles = set()
    # Check if Recruiter
    if NON_TECH_ROLES_RE.search(header_text):
        # If explicit recruiter in header, do NOT add generic tech roles like "backend"
        # even if they appear in the text (often describes who they hire).
        pass
    else:
        # Normal extraction
        for r in ROLES:
            pats = _ROLE_PATTERNS.get(r, [])
            if any(p.search(clean_text) for p in pats):
                # extra guard: devops requires explicit evidence, not just CI/CD mentions
                if r == "devops":
                    if not re.search(r"\b(devops|dev ops|sre|platform engineer|infrastructure)\b", clean_text, re.I):
                        continue
                found_roles.add(r)
    return sorted(list(found_roles)), skills
def norm_pipe(tokens: List[str]) -> str:
    """Serialize *tokens* as a pipe-delimited set: "|a|b|" ("|" when empty).

    Tokens are normalized via _norm_token, deduplicated and sorted; empty
    normalizations are discarded.
    """
    normalized = {_norm_token(tok) for tok in tokens}
    normalized.discard("")
    if not normalized:
        return "|"
    return "|" + "|".join(sorted(normalized)) + "|"
def extract_salary(text: str) -> Tuple[Optional[int], Optional[int], float, Dict]:
    """Best-effort salary extraction.

    Returns (salary_min, salary_max, confidence, debug).  Only lines with a
    salary hint word or an inline currency+number token are scanned; numbers
    are kept in the 20k..30M range; confidence is scored from hint/currency
    evidence and results below 0.45 are discarded.
    """
    dbg: Dict[str, Any] = {"numbers": [], "currency_hits": 0, "hint_lines": 0, "used_lines": []}
    lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]
    if not lines:
        return None, None, 0.0, dbg
    # Candidate lines: (index, line, has_hint_word, has_inline_pay_token).
    candidates: List[Tuple[int, str, bool, bool]] = []
    for i, ln in enumerate(lines):
        has_hint = SALARY_HINT_RE.search(ln) is not None
        has_pay = PAY_TOKEN_RE.search(ln) is not None
        if not has_hint and not has_pay:
            continue
        # "2000 users"-style count nouns disqualify a line unless it also hints salary.
        if SALARY_NOISE_RE.search(ln) and not has_hint:
            continue
        candidates.append((i, ln, has_hint, has_pay))
    if not candidates:
        return None, None, 0.0, dbg
    has_hint = any(x[2] for x in candidates)
    if not has_hint:
        # Inline pay without "salary" is allowed only near header/contact block.
        candidates = [x for x in candidates if x[0] < 15]
        if not candidates:
            return None, None, 0.0, dbg
    scan_chunks: List[str] = []
    for i, ln, hint, _ in candidates:
        chunk = ln
        # A hint line's number often sits on the next line — include it.
        if hint and (i + 1) < len(lines):
            chunk = f"{chunk} {lines[i + 1]}"
        scan_chunks.append(chunk)
        dbg["used_lines"].append(ln)
        if hint:
            dbg["hint_lines"] += 1
        dbg["currency_hits"] += len(CURRENCY_RE.findall(chunk))
    nums: List[int] = []
    for chunk in scan_chunks:
        for m in NUM_RE.finditer(chunk):
            val = None
            if m.group(1) and m.group(2):
                # "120k" / "120к" shorthand.
                val = int(m.group(1)) * 1000
            elif m.group(3):
                # "120 000" with thousands spacing.
                val = int(re.sub(r"\s+", "", m.group(3)))
            elif m.group(4):
                # Bare 4-7 digit figure.
                val = int(m.group(4))
            # Plausibility window: 20k .. 30M (covers RUB monthly figures).
            if val and 20_000 <= val <= 30_000_000:
                nums.append(val)
                dbg["numbers"].append(val)
    if not nums:
        return None, None, 0.0, dbg
    nums = sorted(nums)
    salary_min = nums[0]
    salary_max = nums[-1] if len(nums) > 1 else nums[0]
    # Confidence ladder: hint word beats inline currency; penalties for a
    # suspiciously wide range or a single number.
    if dbg["hint_lines"] > 0:
        conf = 0.82 if dbg["currency_hits"] > 0 else 0.70
    else:
        conf = 0.58 if dbg["currency_hits"] > 0 else 0.0
    if salary_max > salary_min * 4:
        conf -= 0.12
    if len(nums) == 1:
        conf -= 0.06
    conf = max(0.0, min(conf, 0.9))
    if conf < 0.45:
        return None, None, conf, dbg
    return salary_min, salary_max, conf, dbg
def extract_location_best_effort(text: str) -> Optional[str]:
    """Best-effort location extraction.

    First tries labelled lines ("Location: ...", "Город: ..."), then scans the
    pre-section head of the document for a "City, Country"-shaped segment.
    Returns a cleaned location string or None.
    """
    if not text:
        return None
    def _clean_loc(val: str) -> str:
        # Collapse whitespace and strip list/separator punctuation.
        return re.sub(r"\s+", " ", (val or "").strip(" |,;"))
    def _is_loc_like(val: str, *, allow_single: bool = False) -> bool:
        # Reject paths/emails, long digit runs and section headers; accept
        # "City, Country", and a single word only when explicitly labelled.
        v = _clean_loc(val)
        if not v or len(v) < 3 or len(v) > 90:
            return False
        if re.search(r"[@/\\]", v) or re.search(r"\d{3,}", v):
            return False
        if SECTION_HEADER_RE.match(v):
            return False
        if LOCATION_CITY_COUNTRY_RE.match(v):
            return True
        if allow_single and re.fullmatch(r"[A-Za-zА-Яа-я][A-Za-zА-Яа-я' .\-]{1,40}", v):
            return True
        return False
    # Labelled location lines (en/ru).
    patterns = [
        re.compile(r"(?i)\b(location|город|city)\s*:\s*(.{2,40})"),
        re.compile(r"(?i)\b(место)\s*:\s*(.{2,40})"),
        re.compile(r"(?i)\b(проживает|проживание)\s*:\s*(.{2,60})"),
    ]
    for p in patterns:
        m = p.search(text)
        if m:
            val = _clean_loc(m.group(2))
            if _is_loc_like(val, allow_single=True):
                return val
    # Fallback: scan the document head up to the first section header
    # (the contacts header itself is allowed through).
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    head: List[str] = []
    for ln in lines[:60]:
        if SECTION_HEADER_RE.match(ln):
            low = ln.lower()
            if low in ("contacts", "contact", "contact info"):
                continue
            break
        head.append(ln)
    for ln in head:
        # Try the whole line and each "|"-separated segment.
        parts = [ln] + [seg.strip() for seg in ln.split("|") if seg.strip()]
        for seg in parts:
            if _is_loc_like(seg):
                return _clean_loc(seg)
    return None

211
extract/pdf_extract.py Normal file
View File

@@ -0,0 +1,211 @@
from __future__ import annotations
import re
import shutil
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple
try: # optional dependency
from pypdf import PdfReader # type: ignore
except Exception: # pragma: no cover
try:
from PyPDF2 import PdfReader # type: ignore
except Exception: # pragma: no cover
PdfReader = None # type: ignore
try: # optional dependency
from pdfminer.high_level import extract_text as pdfminer_extract_text # type: ignore
except Exception: # pragma: no cover
pdfminer_extract_text = None # type: ignore
@dataclass
class PdfExtractResult:
    """Outcome of trying several PDF text extractors on one file."""
    # Best extracted text (after degluing).
    text: str
    # Per-page dicts {"page": n, "text": ...} from pypdf (may be empty).
    pages: List[dict]
    # Winning extractor: "pdftotext_layout", "pdftotext_plain", "pypdf",
    # "pdfminer" or "none".
    method: str
    # Heuristic quality score from _quality_score (higher is better).
    score: float
    # Quality flags such as "low_alpha", "glued_text", "scan_like".
    flags: List[str]
# Resume section titles (en/ru); their presence raises the extraction
# quality score in _quality_score.
_SECTION_HINTS = [
    "experience", "work experience", "skills", "education", "projects", "summary", "about",
    "опыт работы", "навыки", "образование", "проекты", "о себе",
]
def _which_pdftotext() -> Optional[str]:
exe = shutil.which("pdftotext") or shutil.which("pdftotext.exe")
return exe
def _run_pdftotext(path: Path, *, layout: bool, timeout_sec: int = 25) -> str:
    """Run pdftotext on *path* and return its stripped stdout.

    Returns '' when the binary is missing or the subprocess fails/times out.
    ``layout=True`` preserves the page's visual column layout.
    """
    exe = _which_pdftotext()
    if not exe:
        return ""
    args = [exe]
    if layout:
        args.append("-layout")
    args.extend(["-nopgbrk", str(path), "-"])  # "-" writes text to stdout
    try:
        proc = subprocess.run(
            args,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout_sec,
            check=False,
            text=True,
            encoding="utf-8",
            errors="ignore",
        )
        return (proc.stdout or "").strip()
    except Exception:
        # Timeout or OS-level failure — treat as "no text extracted".
        return ""
def _extract_pages_pypdf(path: Path, max_pages: int = 60) -> List[dict]:
    """Per-page text via pypdf/PyPDF2.

    Returns a list of {"page": 1-based number, "text": str}; empty when the
    library is unavailable or the file cannot be opened.  Capped at
    *max_pages* pages.
    """
    if PdfReader is None:
        return []
    try:
        reader = PdfReader(str(path), strict=False)
    except Exception:
        return []
    pages: List[dict] = []
    for i, page in enumerate(getattr(reader, "pages", [])):
        if max_pages and i >= max_pages:
            break
        try:
            text = page.extract_text() or ""
        except Exception:
            # A single broken page must not kill the whole document.
            text = ""
        pages.append({"page": i + 1, "text": text})
    return pages
def _extract_pdfminer(path: Path) -> str:
    """Whole-document text via pdfminer; '' when unavailable or on any error."""
    if pdfminer_extract_text is None:
        return ""
    try:
        return (pdfminer_extract_text(str(path)) or "").strip()
    except Exception:
        return ""
def _quality_score(text: str) -> Tuple[float, List[str]]:
    """Heuristically score extracted text quality.

    Returns (score, flags): score in roughly 0..7, higher is better; flags
    name the failed checks ("low_alpha", "glued_text", "scan_like", ...).
    """
    flags: List[str] = []
    if not text:
        return 0.0, ["empty"]

    # Character-level metrics.
    total_chars = len(text)
    alpha_ratio = sum(ch.isalpha() for ch in text) / max(1, total_chars)
    space_ratio = text.count(" ") / max(1, total_chars)

    # Word-level metric.
    words = re.findall(r"[A-Za-zА-Яа-я0-9]+", text)
    avg_word_len = (sum(len(w) for w in words) / max(1, len(words))) if words else 0.0

    # Line-level metric: very long lines suggest lost line breaks.
    nonempty = [ln for ln in text.splitlines() if ln.strip()]
    overlong = [ln for ln in nonempty if len(ln) > 200]
    long_line_ratio = (len(overlong) / max(1, len(nonempty))) if nonempty else 0.0

    # "fooBar"/"word123" boundaries suggest glued-together tokens.
    glued_hits = len(re.findall(r"[a-zа-я][A-ZА-Я]|[A-Za-zА-Яа-я][0-9]|[0-9][A-Za-zА-Яа-я]", text))
    section_hits = sum(1 for s in _SECTION_HINTS if s in text.lower())

    score = 0.0
    if alpha_ratio >= 0.45:
        score += 2.0
    elif alpha_ratio >= 0.30:
        score += 1.0
    else:
        flags.append("low_alpha")
    if 0.10 <= space_ratio <= 0.28:
        score += 1.0
    else:
        flags.append("odd_spacing")
    if 3.5 <= avg_word_len <= 9.0:
        score += 1.0
    else:
        flags.append("odd_word_len")
    if long_line_ratio <= 0.06:
        score += 1.0
    else:
        flags.append("long_lines")
    if glued_hits <= 6:
        score += 1.0
    else:
        flags.append("glued_text")
    if section_hits >= 2:
        score += 1.0
    elif section_hits == 1:
        score += 0.5
    if total_chars < 200:
        flags.append("short_text")
    # Almost no letters or nearly empty output usually means a scanned PDF.
    if alpha_ratio < 0.08 or total_chars < 120:
        flags.append("scan_like")
    return score, flags
def deglue_text(text: str) -> str:
    """Insert missing spaces at case/digit boundaries left by bad PDF extraction.

    Applied in order: lower→Upper, letter→digit, digit→letter, and
    ':'/';' glued to a following word.
    """
    if not text:
        return text
    rules = (
        (r"([a-zа-я])([A-ZА-Я])", r"\1 \2"),     # "fooBar"   -> "foo Bar"
        (r"([A-Za-zА-Яа-я])([0-9])", r"\1 \2"),  # "python3"  -> "python 3"
        (r"([0-9])([A-Za-zА-Яа-я])", r"\1 \2"),  # "3years"   -> "3 years"
        (r"([:;])([A-Za-zА-Яа-я])", r"\1 \2"),   # "skills:x" -> "skills: x"
    )
    result = text
    for pattern, repl in rules:
        result = re.sub(pattern, repl, result)
    return result
def extract_pdf_best(path: Path, timeout_sec: int = 25) -> PdfExtractResult:
    """Run every available extractor and keep the highest-scoring text.

    Tries pdftotext (layout and plain modes), pypdf and pdfminer; each
    candidate is scored by _quality_score and the best text is degluing-fixed
    before returning.  Per-page texts always come from pypdf regardless of
    which method won.
    """
    candidates: List[Tuple[str, str]] = []
    txt_layout = _run_pdftotext(path, layout=True, timeout_sec=timeout_sec)
    if txt_layout:
        candidates.append(("pdftotext_layout", txt_layout))
    txt_plain = _run_pdftotext(path, layout=False, timeout_sec=timeout_sec)
    if txt_plain:
        candidates.append(("pdftotext_plain", txt_plain))
    txt_pypdf = ""
    if PdfReader is not None:
        pages = _extract_pages_pypdf(path)
        if pages:
            txt_pypdf = "\n\n".join(p.get("text", "") for p in pages if p.get("text"))
    if txt_pypdf:
        candidates.append(("pypdf", txt_pypdf))
    txt_pdfminer = _extract_pdfminer(path)
    if txt_pdfminer:
        candidates.append(("pdfminer", txt_pdfminer))
    if not candidates:
        return PdfExtractResult(text="", pages=[], method="none", score=0.0, flags=["empty"])
    # Pick the candidate with the highest quality score (first wins ties).
    best_method = "none"
    best_text = ""
    best_score = -1.0
    best_flags: List[str] = []
    for method, text in candidates:
        score, flags = _quality_score(text)
        if score > best_score:
            best_score = score
            best_method = method
            best_text = text
            best_flags = flags
    pages = _extract_pages_pypdf(path)
    best_text = deglue_text(best_text)
    return PdfExtractResult(text=best_text, pages=pages, method=best_method, score=best_score, flags=best_flags)

70
extract/sections.py Normal file
View File

@@ -0,0 +1,70 @@
from __future__ import annotations
import re
from typing import Dict, List, Optional, Tuple
# Canonical section key -> header-line patterns (en/ru).  A resume line that
# fully matches one of these switches split_sections into that section.
# NOTE(review): "курсы"/"сертификаты" appear under both "education" and
# "certifications"; dict order means "education" wins — confirm intended.
_SECTION_PATTERNS: dict[str, List[re.Pattern]] = {
    "contacts": [
        re.compile(r"^\s*(contacts?|contact info|контакты)\s*$", re.I),
    ],
    "about": [
        re.compile(r"^\s*(summary|about|profile|objective|о\s+себе|обо\s+мне|профиль|цель)\s*$", re.I),
    ],
    "skills": [
        re.compile(r"^\s*(skills?|key skills|stack|tech( stack)?|навыки|технологии|компетенции)\s*$", re.I),
    ],
    "experience": [
        re.compile(r"^\s*(experience|work experience|employment|опыт\s+работы|опыт)\s*$", re.I),
    ],
    "education": [
        re.compile(r"^\s*(education|образование|курсы|certifications?|сертификаты)\s*$", re.I),
    ],
    "projects": [
        re.compile(r"^\s*(projects?|проекты)\s*$", re.I),
    ],
    "languages": [
        re.compile(r"^\s*(languages?|языки)\s*$", re.I),
    ],
    "certifications": [
        re.compile(r"^\s*(certifications?|сертификаты|курсы)\s*$", re.I),
    ],
    "publications": [
        re.compile(r"^\s*(publications?|публикации)\s*$", re.I),
    ],
}
def _match_header(line: str) -> Optional[str]:
    """Return the canonical section key whose header pattern matches *line*.

    Patterns are tried in _SECTION_PATTERNS insertion order; None when no
    pattern matches.
    """
    return next(
        (
            key
            for key, patterns in _SECTION_PATTERNS.items()
            for rx in patterns
            if rx.match(line)
        ),
        None,
    )
def split_sections(clean_text: str, doc_type: str | None = None) -> Dict[str, str]:
    """Split cleaned resume text into named sections.

    Lines before the first recognized header collect under "header"; a line
    matching a section pattern switches the current bucket.  Empty sections
    are dropped from the result.  *doc_type* is currently unused.
    """
    buckets: Dict[str, List[str]] = {"header": []}
    current = "header"
    for raw in (clean_text or "").splitlines():
        line = raw.strip()
        if not line:
            continue
        matched = _match_header(line)
        if matched is not None:
            current = matched
            buckets.setdefault(current, [])
        else:
            buckets.setdefault(current, []).append(line)
    result: Dict[str, str] = {}
    for name, collected in buckets.items():
        body = "\n".join(collected).strip()
        if body:
            result[name] = body
    return result
def sections_present(sections: Dict[str, str]) -> List[str]:
    """Sorted names of non-empty sections, excluding the implicit "header"."""
    found = {name for name, body in (sections or {}).items() if body and name != "header"}
    return sorted(found)

View File

@@ -0,0 +1 @@
__all__ = []

View File

@@ -0,0 +1,46 @@
from __future__ import annotations
from typing import Any, Dict
from tg_resume_db.extract.parse import (
extract_contacts,
extract_name_guess,
extract_remote,
extract_english,
extract_roles_skills,
extract_salary,
extract_location_best_effort,
extract_experience_years,
)
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
    """Generic heuristic parser: run every extractor over the full cleaned text.

    *sections* is accepted for interface parity with template parsers but is
    not used here.
    """
    text = clean_text or ""
    exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(text)
    sal_min, sal_max, sal_conf, sal_dbg = extract_salary(text)
    roles, skills = extract_roles_skills(text)
    parsed: Dict[str, Any] = {
        "name": extract_name_guess(text),
        "contacts_raw": extract_contacts(text),
        "remote": extract_remote(text),
        "english": extract_english(text),
        "roles": roles,
        "skills": skills,
        "location": extract_location_best_effort(text),
        "exp_years": exp_years,
        "exp_years_eng": exp_years_eng,
        "exp_conf": exp_conf,
        "exp_dbg": exp_dbg,
        "salary_min": sal_min,
        "salary_max": sal_max,
        "salary_conf": sal_conf,
        "salary_dbg": sal_dbg,
        "parse_method": "generic_heur",
    }
    return parsed

58
extract/templates/hh.py Normal file
View File

@@ -0,0 +1,58 @@
from __future__ import annotations
from typing import Any, Dict
from tg_resume_db.extract.parse import (
extract_contacts,
extract_name_guess,
extract_remote,
extract_english,
extract_roles_skills,
extract_salary,
extract_location_best_effort,
extract_experience_years,
)
def _pick(sections: Dict[str, str] | None, key: str, fallback: str) -> str:
if not sections:
return fallback
return sections.get(key) or fallback
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
    """HH.ru template parser: scope extractors to sections where available."""
    header_text = _pick(sections, "header", clean_text)
    contacts_text = _pick(sections, "contacts", clean_text)
    about_text = _pick(sections, "about", clean_text)
    skills_text = _pick(sections, "skills", clean_text)
    exp_text = _pick(sections, "experience", clean_text)
    # Experience heuristics see the summary plus the work-history section.
    exp_scope = "\n".join([about_text, exp_text]).strip() or exp_text
    exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(exp_scope)
    sal_min, sal_max, sal_conf, sal_dbg = extract_salary(clean_text)
    roles, skills = extract_roles_skills("\n".join([about_text, skills_text, exp_text]))
    parsed: Dict[str, Any] = {
        "name": extract_name_guess(header_text),
        "contacts_raw": extract_contacts(contacts_text),
        "remote": extract_remote(clean_text),
        "english": extract_english(clean_text),
        "roles": roles,
        "skills": skills,
        "location": extract_location_best_effort(clean_text),
        "exp_years": exp_years,
        "exp_years_eng": exp_years_eng,
        "exp_conf": exp_conf,
        "exp_dbg": exp_dbg,
        "salary_min": sal_min,
        "salary_max": sal_max,
        "salary_conf": sal_conf,
        "salary_dbg": sal_dbg,
        "parse_method": "hh_template",
    }
    return parsed

View File

@@ -0,0 +1,85 @@
from __future__ import annotations
import re
from typing import Any, Dict, Optional
from tg_resume_db.extract.parse import (
extract_contacts,
extract_name_guess,
extract_remote,
extract_english,
extract_roles_skills,
extract_salary,
extract_location_best_effort,
extract_experience_years,
)
# HH.ru labelled fields: "Желаемая должность", "Специализация",
# "График работы", "Занятость" — each captures the value after the label.
_DESIRED_RE = re.compile(r"(?i)жел[а-я]*\s+должност[ьи]\s*[:\-]?\s*(.+)")
_SPEC_RE = re.compile(r"(?i)специализаци[яи]\s*[:\-]?\s*(.+)")
_SCHEDULE_RE = re.compile(r"(?i)график\s+работы\s*[:\-]?\s*(.+)")
_EMPLOYMENT_RE = re.compile(r"(?i)занятость\s*[:\-]?\s*(.+)")
def _pick(sections: Dict[str, str] | None, key: str, fallback: str) -> str:
if not sections:
return fallback
return sections.get(key) or fallback
def _find_first(regex: re.Pattern, text: str) -> Optional[str]:
for ln in text.splitlines():
m = regex.search(ln)
if m:
val = m.group(1).strip()
val = re.split(r"[|;/]", val)[0].strip()
if 2 <= len(val) <= 80:
return val
return None
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
    """HH.ru template parser with extra labelled fields (desired title, schedule, ...)."""
    header_text = _pick(sections, "header", clean_text)
    contacts_text = _pick(sections, "contacts", clean_text)
    about_text = _pick(sections, "about", clean_text)
    skills_text = _pick(sections, "skills", clean_text)
    exp_text = _pick(sections, "experience", clean_text)
    # Experience heuristics see the summary plus the work-history section.
    exp_scope = "\n".join([about_text, exp_text]).strip() or exp_text
    exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(exp_scope)
    sal_min, sal_max, sal_conf, sal_dbg = extract_salary(clean_text)
    roles, skills = extract_roles_skills("\n".join([about_text, skills_text, exp_text]))
    parsed: Dict[str, Any] = {
        "name": extract_name_guess(header_text),
        "contacts_raw": extract_contacts(contacts_text),
        "remote": extract_remote(clean_text),
        "english": extract_english(clean_text),
        "roles": roles,
        "skills": skills,
        "location": extract_location_best_effort(clean_text),
        "exp_years": exp_years,
        "exp_years_eng": exp_years_eng,
        "exp_conf": exp_conf,
        "exp_dbg": exp_dbg,
        "salary_min": sal_min,
        "salary_max": sal_max,
        "salary_conf": sal_conf,
        "salary_dbg": sal_dbg,
        # HH-specific labelled fields (None when the label is absent).
        "desired_title": _find_first(_DESIRED_RE, clean_text),
        "specializations": _find_first(_SPEC_RE, clean_text),
        "employment_type": _find_first(_EMPLOYMENT_RE, clean_text),
        "schedule": _find_first(_SCHEDULE_RE, clean_text),
        "parse_method": "hh_template",
    }
    return parsed

View File

@@ -0,0 +1,57 @@
from __future__ import annotations
from typing import Any, Dict
from tg_resume_db.extract.parse import (
extract_contacts,
extract_name_guess,
extract_remote,
extract_english,
extract_roles_skills,
extract_salary,
extract_location_best_effort,
extract_experience_years,
)
def _pick(sections: Dict[str, str] | None, key: str, fallback: str) -> str:
if not sections:
return fallback
return sections.get(key) or fallback
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
    """LinkedIn-export template parser.

    Contacts are scanned over the whole text (LinkedIn exports have no
    dedicated contacts section); other extractors are section-scoped.
    """
    header_text = _pick(sections, "header", clean_text)
    about_text = _pick(sections, "about", clean_text)
    skills_text = _pick(sections, "skills", clean_text)
    exp_text = _pick(sections, "experience", clean_text)
    exp_scope = "\n".join([about_text, exp_text]).strip() or exp_text
    exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(exp_scope)
    sal_min, sal_max, sal_conf, sal_dbg = extract_salary(clean_text)
    roles, skills = extract_roles_skills("\n".join([about_text, skills_text, exp_text]))
    parsed: Dict[str, Any] = {
        "name": extract_name_guess(header_text),
        "contacts_raw": extract_contacts(clean_text),
        "remote": extract_remote(clean_text),
        "english": extract_english(clean_text),
        "roles": roles,
        "skills": skills,
        "location": extract_location_best_effort(clean_text),
        "exp_years": exp_years,
        "exp_years_eng": exp_years_eng,
        "exp_conf": exp_conf,
        "exp_dbg": exp_dbg,
        "salary_min": sal_min,
        "salary_max": sal_max,
        "salary_conf": sal_conf,
        "salary_dbg": sal_dbg,
        "parse_method": "linkedin_template",
    }
    return parsed

View File

@@ -0,0 +1,46 @@
from __future__ import annotations
from typing import Any, Dict
from tg_resume_db.extract.parse import (
extract_contacts,
extract_name_guess,
extract_remote,
extract_english,
extract_roles_skills,
extract_salary,
extract_location_best_effort,
extract_experience_years,
)
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
    """Section-agnostic one-page parser: every extractor scans the whole text.

    *sections* is accepted for signature parity with the other templates
    but is not consulted here.
    """
    text = clean_text or ""

    contacts_raw = extract_contacts(text)
    name = extract_name_guess(text)
    roles, skills = extract_roles_skills(text)
    remote = extract_remote(text)
    english = extract_english(text)
    location = extract_location_best_effort(text)
    exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(text)
    sal_min, sal_max, sal_conf, sal_dbg = extract_salary(text)

    return dict(
        name=name,
        contacts_raw=contacts_raw,
        remote=remote,
        english=english,
        roles=roles,
        skills=skills,
        location=location,
        exp_years=exp_years,
        exp_years_eng=exp_years_eng,
        exp_conf=exp_conf,
        exp_dbg=exp_dbg,
        salary_min=sal_min,
        salary_max=sal_max,
        salary_conf=sal_conf,
        salary_dbg=sal_dbg,
        parse_method="one_page_template",
    )

View File

@@ -0,0 +1,11 @@
from __future__ import annotations
from typing import Any, Dict
from tg_resume_db.extract.templates.one_page import parse_resume as _parse
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
    """Delegate to the generic one-page parser, re-tagging the result as the
    English variant."""
    result = _parse(clean_text, sections)
    result["parse_method"] = "one_page_en"
    return result

View File

@@ -0,0 +1,11 @@
from __future__ import annotations
from typing import Any, Dict
from tg_resume_db.extract.templates.one_page import parse_resume as _parse
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
    """Delegate to the generic one-page parser, re-tagging the result as the
    Russian variant."""
    result = _parse(clean_text, sections)
    result["parse_method"] = "one_page_ru"
    return result

View File

@@ -0,0 +1,45 @@
from __future__ import annotations
from typing import Any, Dict
from tg_resume_db.extract.parse import (
extract_contacts,
extract_name_guess,
extract_remote,
extract_english,
extract_roles_skills,
extract_salary,
extract_location_best_effort,
extract_experience_years,
)
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
    """PPTX-template parser: all extractors run over the full cleaned text.

    *sections* is accepted for signature parity with the other templates
    but is not consulted here.
    """
    text = clean_text or ""

    contacts_raw = extract_contacts(text)
    name = extract_name_guess(text)
    roles, skills = extract_roles_skills(text)
    remote = extract_remote(text)
    english = extract_english(text)
    location = extract_location_best_effort(text)
    exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(text)
    sal_min, sal_max, sal_conf, sal_dbg = extract_salary(text)

    return dict(
        name=name,
        contacts_raw=contacts_raw,
        remote=remote,
        english=english,
        roles=roles,
        skills=skills,
        location=location,
        exp_years=exp_years,
        exp_years_eng=exp_years_eng,
        exp_conf=exp_conf,
        exp_dbg=exp_dbg,
        salary_min=sal_min,
        salary_max=sal_max,
        salary_conf=sal_conf,
        salary_dbg=sal_dbg,
        parse_method="pptx_template",
    )

99
extract/text_extract.py Normal file
View File

@@ -0,0 +1,99 @@
from __future__ import annotations
import os
from pathlib import Path
import logging
from bs4 import BeautifulSoup
# Resolve an optional PdfReader implementation at import time: prefer the
# modern "pypdf" package, fall back to the legacy "PyPDF2" name, and leave
# _PdfReader as None when neither is installed (extract_text_from_pdf
# raises RuntimeError in that case).
try: # optional dependency for PDF fallback
    from pypdf import PdfReader as _PdfReader # type: ignore
except Exception: # pragma: no cover - optional import
    try:
        from PyPDF2 import PdfReader as _PdfReader # type: ignore
    except Exception: # pragma: no cover
        _PdfReader = None # type: ignore
def _read_bytes(path: Path) -> bytes:
    """Read the file at *path* fully into memory as raw bytes."""
    with path.open("rb") as fh:
        return fh.read()
def extract_text_from_txt(path: Path) -> str:
    """Decode a text file, trying a few common encodings in order.

    Each candidate is tried with strict decoding; the first one that
    decodes the whole file wins.  The original used errors="ignore"
    inside the loop, so the first candidate (utf-8) always "succeeded"
    and the utf-16/cp1251/latin-1 fallbacks were dead code — cp1251
    (Russian) files came back with every Cyrillic byte silently dropped.
    latin-1 accepts any byte sequence and so acts as the terminal
    fallback; the final lossy utf-8 decode is kept as a safety net.
    """
    data = path.read_bytes()
    for enc in ("utf-8", "utf-16", "cp1251", "latin-1"):
        try:
            return data.decode(enc)  # strict: fall through on decode error
        except UnicodeError:
            continue
    return data.decode("utf-8", errors="ignore")
def extract_text_from_html(path: Path) -> str:
    """Extract visible text from an HTML file, one line per text node.

    Prefers the fast "lxml" parser but falls back to the stdlib
    "html.parser" backend when lxml is not installed (BeautifulSoup
    raises FeatureNotFound in that case), so the extractor no longer has
    a hard dependency on an optional package.
    """
    html = extract_text_from_txt(path)
    try:
        soup = BeautifulSoup(html, "lxml")
    except Exception:  # lxml missing -> bs4 FeatureNotFound
        soup = BeautifulSoup(html, "html.parser")
    return soup.get_text("\n", strip=True)
def extract_text_from_docx(path: Path) -> str:
    """Pull plain text from a .docx: non-empty paragraphs, then table rows
    rendered as " | "-joined cell texts, all newline-separated."""
    from docx import Document

    document = Document(str(path))
    chunks = []
    for paragraph in document.paragraphs:
        stripped = (paragraph.text or "").strip()
        if stripped:
            chunks.append(stripped)
    for tbl in document.tables:
        for row in tbl.rows:
            cells = [c.text.strip() for c in row.cells if c.text and c.text.strip()]
            if cells:
                chunks.append(" | ".join(cells))
    return "\n".join(chunks)
# Hard cap on pages read per PDF; override with the PDF_PAGE_LIMIT env var
# (0 disables the cap — see the truthiness check in extract_text_from_pdf).
_PDF_PAGE_LIMIT = int(os.environ.get("PDF_PAGE_LIMIT", "40"))
# Silence noisy pypdf warnings like "Ignoring wrong pointing object ..."
logging.getLogger("pypdf").setLevel(logging.ERROR)
logging.getLogger("PyPDF2").setLevel(logging.ERROR)
def extract_text_from_pdf(path: Path) -> str:
    """Lightweight PDF text extractor built on the optional PyPDF readers.

    Reads at most _PDF_PAGE_LIMIT pages (env PDF_PAGE_LIMIT, default 40;
    0 means unlimited) and skips pages whose extraction fails.

    Raises:
        RuntimeError: when no PdfReader backend is installed or the file
            cannot be opened as a PDF.
    """
    if _PdfReader is None:
        raise RuntimeError("PDF reader dependency missing (install pypdf or PyPDF2)")
    try:
        reader = _PdfReader(str(path), strict=False)
    except Exception as exc:  # pragma: no cover - pdf parser edge cases
        raise RuntimeError(f"PDF read failed: {exc}") from exc

    chunks = []
    for page_no, page in enumerate(getattr(reader, "pages", [])):
        if _PDF_PAGE_LIMIT and page_no >= _PDF_PAGE_LIMIT:
            break
        try:
            page_text = page.extract_text()  # type: ignore[attr-defined]
        except Exception:
            page_text = None  # unreadable page: skip, keep the rest
        if page_text:
            chunks.append(page_text)
    return "\n".join(chunks)
def extract_text_from_doc_best_effort(path: Path) -> str:
    """Best-effort text for legacy .doc files via the optional ``textract``
    package; returns "" when the package is missing or extraction fails."""
    try:
        import textract  # type: ignore
    except Exception:
        return ""
    try:
        raw = textract.process(str(path))
    except Exception:
        return ""
    return raw.decode("utf-8", errors="ignore")
def extract_text(path: Path) -> str:
    """Dispatch to the per-format extractor based on the file suffix
    (case-insensitive); unsupported or missing suffixes yield ""."""
    suffix = path.suffix.lower()
    if suffix in {".txt", ".log"}:
        return extract_text_from_txt(path)
    if suffix in {".html", ".htm"}:
        return extract_text_from_html(path)
    if suffix == ".docx":
        return extract_text_from_docx(path)
    if suffix == ".pdf":
        return extract_text_from_pdf(path)
    if suffix == ".doc":
        return extract_text_from_doc_best_effort(path)
    return ""