Initial commit

2026-03-11 15:27:10 +03:00
commit 8b4b8d54d1
34 changed files with 7407 additions and 0 deletions

3
.gitignore vendored Normal file

@@ -0,0 +1,3 @@
.venv/
__pycache__/
*.pyc

2
__init__.py Normal file

@@ -0,0 +1,2 @@
__all__ = []
__version__ = "1.0.0"

1184
agent.py Normal file

File diff suppressed because it is too large

77
api.py Normal file

@@ -0,0 +1,77 @@
from __future__ import annotations
import os
from typing import Any, Dict, Optional
from fastapi import FastAPI
from pydantic import BaseModel, Field
from tg_resume_db.db import connect, init_db
from tg_resume_db.agent import agent_search
from tg_resume_db.search import search as db_search
DB_PATH = os.environ.get("CANDIDATES_DB", "./candidates.db")
app = FastAPI(title="Resume Search API", version="1.0")
class SearchRequest(BaseModel):
query: str = Field(default="")
limit: int = Field(default=20, ge=1, le=100)
offset: int = Field(default=0, ge=0)
remote: Optional[bool] = None
location: Optional[str] = None
experience_min: Optional[float] = None
salary_min: Optional[int] = None
salary_max: Optional[int] = None
english: Optional[str] = None
role: Optional[str] = None
skill: Optional[str] = None
class AISearchRequest(BaseModel):
prompt: str = Field(default="")
limit: int = Field(default=20, ge=1, le=100)
ai_iters: int = Field(default=2, ge=0, le=5)
@app.on_event("startup")
def _startup():
con = connect(DB_PATH)
init_db(con)
con.close()
@app.get("/health")
def health():
return {"ok": True}
@app.post("/search")
def search(req: SearchRequest) -> Dict[str, Any]:
con = connect(DB_PATH)
try:
items = db_search(con, query=req.query, filters=req.model_dump(), limit=req.limit, offset=req.offset)
return {"items": items, "count": len(items)}
finally:
con.close()
@app.post("/search/ai")
def search_ai(req: AISearchRequest) -> Dict[str, Any]:
con = connect(DB_PATH)
try:
res = agent_search(
con,
user_prompt=req.prompt,
max_iters=req.ai_iters,
limit=req.limit,
)
return {
"ai": True,
"llm_used": res.get("llm_used", False),
"plan": res.get("plan"),
"history": res.get("history"),
"postfilter": res.get("postfilter"),
"items": res.get("items", []),
"count": int(res.get("count", 0)),
}
finally:
con.close()
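
For context, a minimal client sketch for the /search endpoint above (not part of the commit). It assumes the app is served locally, e.g. via uvicorn, and that httpx is installed; the base URL is a placeholder.

import httpx

def search_candidates(base_url: str = "http://127.0.0.1:8000") -> list:
    # Mirrors SearchRequest above; omitted fields fall back to their defaults.
    payload = {"query": "python", "limit": 5, "remote": True}
    resp = httpx.post(f"{base_url}/search", json=payload, timeout=30.0)
    resp.raise_for_status()
    return resp.json()["items"]

if __name__ == "__main__":
    for item in search_candidates():
        print(item)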

267
bundle_export.py Normal file

@@ -0,0 +1,267 @@
from __future__ import annotations
import json
import os
import re
import shutil
import sqlite3
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional
# NEW: PDF merge helper (pypdf)
# pip install pypdf
try:
from tg_resume_db.pdf_merge import merge_all_pdfs_in_dir
except Exception:  # do not break bundling if pypdf / the pdf_merge module is not installed
merge_all_pdfs_in_dir = None
def _slug(s: str, max_len: int = 60) -> str:
s = (s or "").strip()
if not s:
return "candidate"
s = re.sub(r"\s+", " ", s)
s = re.sub(r"[^0-9A-Za-zА-Яа-я_\- ]+", "_", s)
s = s.replace(" ", "_")
s = re.sub(r"_+", "_", s).strip("_")
if not s:
return "candidate"
return s[:max_len]
def _safe_mkdir(p: Path) -> None:
p.mkdir(parents=True, exist_ok=True)
def _pick_source_paths(con: sqlite3.Connection, resume_id: str) -> List[str]:
"""
Возвращает список самых приоритетных путей к файлу резюме.
1) resumes.file_path
2) sources.original_file_path
3) некоторые варианты путей из sources.extra_json
"""
paths: List[str] = []
row = con.execute(
"SELECT file_path FROM resumes WHERE resume_id=?",
(resume_id,),
).fetchone()
if row and row["file_path"]:
paths.append(str(row["file_path"]))
cur = con.execute(
"""SELECT original_file_path, original_file_name, extra_json
FROM sources
WHERE resume_id=?""",
(resume_id,),
)
for r in cur.fetchall():
ofp = r["original_file_path"]
if ofp:
paths.append(str(ofp))
try:
extra = json.loads(r["extra_json"] or "{}")
if isinstance(extra, dict):
for k in ("file_path", "path", "local_path", "source_path"):
if extra.get(k):
paths.append(str(extra[k]))
except Exception:
pass
# dedupe
seen = set()
out: List[str] = []
for p in paths:
p2 = os.path.normpath(p)
if p2 in seen:
continue
seen.add(p2)
out.append(p2)
return out
def _copy_unique(src: Path, dst_dir: Path, base_name: str) -> Path:
ext = src.suffix.lower() if src.suffix else ""
candidate = f"{base_name}{ext}"
dst = dst_dir / candidate
if dst.exists():
i = 2
while True:
dst = dst_dir / f"{base_name}({i}){ext}"
if not dst.exists():
break
i += 1
shutil.copy2(src, dst)
return dst
def bundle_search_results(
con: sqlite3.Connection,
results: Iterable[Dict[str, Any]],
out_dir: str,
*,
copy_files: bool = True,
merge_text: bool = True,
merge_pdf: bool = True, # NEW
) -> Dict[str, Any]:
"""
results: iterable dictов где есть минимум:
- resume_id
- candidate_id
- name (желательно)
Создаёт:
- files/: скопированные исходные файлы резюме
- merged_resumes.txt: склейка текста clean_text из БД (если merge_text)
- pdf/merged.pdf: склейка всех PDF из files/ (если merge_pdf и pypdf установлен)
- manifest.json
- README.txt
"""
out_root = Path(out_dir).resolve()
files_dir = out_root / "files"
_safe_mkdir(files_dir)
manifest: List[Dict[str, Any]] = []
copied = 0
missing = 0
merged_parts: List[str] = []
merged_txt_path = out_root / "merged_resumes.txt"
for item in results:
resume_id = item.get("resume_id")
cand_id = item.get("candidate_id")
name = item.get("name") or ""
if not resume_id or not cand_id:
continue
# merged TXT from the DB
if merge_text:
row = con.execute(
"SELECT clean_text FROM resumes WHERE resume_id=?",
(resume_id,),
).fetchone()
clean_text = (row["clean_text"] if row else "") or ""
header = f"===== {name or cand_id} | {cand_id} | {resume_id} ====="
merged_parts.append(header)
merged_parts.append(clean_text.strip())
merged_parts.append("")
if not copy_files:
continue
src_paths = _pick_source_paths(con, resume_id)
src_found: Optional[Path] = None
for sp in src_paths:
p = Path(sp)
if p.exists() and p.is_file():
src_found = p
break
if not src_found:
missing += 1
manifest.append(
{
"candidate_id": cand_id,
"name": name,
"resume_id": resume_id,
"copied": False,
"reason": "source_file_not_found",
"tried_paths": src_paths,
}
)
continue
base = f"{_slug(name) or _slug(cand_id)}__{resume_id}"
try:
dst = _copy_unique(src_found, files_dir, base)
copied += 1
manifest.append(
{
"candidate_id": cand_id,
"name": name,
"resume_id": resume_id,
"copied": True,
"source_path": str(src_found),
"dest_path": str(dst),
}
)
except Exception as e:
missing += 1
manifest.append(
{
"candidate_id": cand_id,
"name": name,
"resume_id": resume_id,
"copied": False,
"reason": f"copy_failed: {repr(e)}",
"source_path": str(src_found),
}
)
# merged TXT
if merge_text:
merged_txt_path.write_text("\n".join(merged_parts), encoding="utf-8", errors="ignore")
# NEW: merged PDF from files/*.pdf
merged_pdf_path: Optional[Path] = None
pdf_info: Optional[Dict[str, Any]] = None
if merge_pdf and copy_files and merge_all_pdfs_in_dir is not None:
try:
merged_pdf_path = out_root / "pdf" / "merged.pdf"
_safe_mkdir(merged_pdf_path.parent)
pdf_info = merge_all_pdfs_in_dir(files_dir, merged_pdf_path)
except Exception as e:
pdf_info = {"error": f"merge_pdf_failed: {repr(e)}"}
# manifest.json
(out_root / "manifest.json").write_text(
json.dumps(
{
"out_dir": str(out_root),
"copied_files": copied,
"missing_files": missing,
"merged_text": str(merged_txt_path) if merge_text else None,
"merged_pdf": str(merged_pdf_path) if merged_pdf_path else None,
"pdf_info": pdf_info,
"items": manifest,
},
ensure_ascii=False,
indent=2,
),
encoding="utf-8",
errors="ignore",
)
# README
readme_lines = [
"Папка создана командой search.",
"- files/: скопированные исходные файлы резюме",
"- merged_resumes.txt: склейка текста clean_text из БД",
"- manifest.json: что откуда скопировалось / что не найдено",
]
if merge_pdf:
if merge_all_pdfs_in_dir is None:
readme_lines.append("- pdf/merged.pdf: НЕ создан (нужен пакет pypdf и модуль pdf_merge.py)")
else:
readme_lines.append("- pdf/merged.pdf: склейка всех PDF из files/ (если PDF были)")
(out_root / "README.txt").write_text(
"\n".join(readme_lines) + "\n",
encoding="utf-8",
errors="ignore",
)
return {
"out_dir": str(out_root),
"copied_files": copied,
"missing_files": missing,
"merged_text": str(merged_txt_path) if merge_text else None,
"merged_pdf": str(merged_pdf_path) if merged_pdf_path else None,
"manifest": str(out_root / "manifest.json"),
"pdf_info": pdf_info,
}
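
A hedged usage sketch for bundle_search_results (not part of the commit). It assumes ./candidates.db already exists and is populated, and that an empty filters dict is acceptable to tg_resume_db.search.search; adjust paths and the query to taste.

from tg_resume_db.db import connect, init_db
from tg_resume_db.search import search as run_search
from tg_resume_db.bundle_export import bundle_search_results

con = connect("./candidates.db")
init_db(con)
try:
    # Find candidates, then copy their source files and merge their texts into ./bundle_demo.
    items = run_search(con, query="golang", filters={}, limit=10, offset=0)
    info = bundle_search_results(con, items, "./bundle_demo", copy_files=True, merge_text=True)
    print(info["copied_files"], info["missing_files"], info["manifest"])
finally:
    con.close()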

282
cli.py Normal file

@@ -0,0 +1,282 @@
from __future__ import annotations
import argparse
import json
import sys
from datetime import datetime
from typing import Any, Dict
from pathlib import Path
import os
from tg_resume_db.bundle_export import bundle_search_results
from tg_resume_db.db import connect, init_db
from tg_resume_db.pipeline import import_exports as run_import
from tg_resume_db.search import search as run_search
from tg_resume_db.util import Logger
from tg_resume_db.extract.text_extract import extract_text as extract_text_generic
from tg_resume_db.extract.pdf_extract import extract_pdf_best
from tg_resume_db.extract.clean import normalize_text
from tg_resume_db.extract.doc_type import detect_doc_type
from tg_resume_db.extract.sections import split_sections, sections_present
from tg_resume_db.extract.parse import extract_name_guess
def _print_json(obj: Dict[str, Any]) -> None:
s = json.dumps(obj, ensure_ascii=False, indent=2)
try:
print(s)
except UnicodeEncodeError:
# Fallback for cp1251/legacy consoles.
print(s.encode("ascii", "backslashreplace").decode("ascii"))
def _is_interactive() -> bool:
return sys.stdin.isatty() and sys.stdout.isatty()
def main() -> None:
ap = argparse.ArgumentParser(prog="tg_resume_db")
sub = ap.add_subparsers(dest="cmd", required=True)
# ---------------- import_exports ----------------
imp = sub.add_parser("import_exports", help="Import Telegram exports recursively (incremental)")
imp.add_argument("--input", required=True, help="Path to exports directory")
imp.add_argument("--db", required=True, help="SQLite db path")
imp.add_argument("--log", default="./import.log", help="Log file path")
imp.add_argument("--near-dist", type=int, default=6, help="Simhash max Hamming distance for near-duplicates")
imp.add_argument("--min-text-len", type=int, default=250, help="Skip very short texts")
imp.add_argument(
"--llm",
choices=["auto", "off", "force"],
default="auto",
help="LLM enrichment mode: auto (default), off to disable, force to always run when configured",
)
imp.add_argument(
"--llm-review",
choices=["always", "auto", "off"],
default="always",
help="LLM review mode for parsed JSON: always (default), auto, off",
)
imp.add_argument(
"--llm-review-rounds",
type=int,
default=1,
help="How many LLM review merge rounds to run per resume (1..3)",
)
# ---------------- search ----------------
s = sub.add_parser("search", help="Search candidates")
s.add_argument("--db", required=True)
s.add_argument("--query", required=True)
s.add_argument("--limit", type=int, default=20)
s.add_argument("--offset", type=int, default=0)
s.add_argument("--remote", choices=["true", "false"], default=None)
s.add_argument("--location", default=None)
s.add_argument("--experience-min", type=float, default=None)
s.add_argument("--salary-min", type=int, default=None)
s.add_argument("--salary-max", type=int, default=None)
s.add_argument("--english", default=None)
s.add_argument("--doc-type", default=None)
# AI mode
s.add_argument("--ai", action="store_true", help="Use LLM to build filters from text query and run search")
s.add_argument("--ai-iters", type=int, default=2, help="How many refine iterations for AI search")
# Backward compatible single-value filters
s.add_argument("--role", default=None, help="Single role (backward compatible); prefer --roles-any")
s.add_argument("--skill", default=None, help="Single skill (backward compatible); prefer --skills-any/--skills-all")
# Stack filters (comma-separated)
s.add_argument("--roles-any", default=None, help="Comma-separated roles; at least one must match")
s.add_argument("--skills-any", default=None, help="Comma-separated skills; at least one must match")
s.add_argument("--skills-all", default=None, help="Comma-separated skills; all must match")
# Bundle export behavior
s.add_argument("--bundle", choices=["ask", "yes", "no"], default="ask", help="Bundle found resumes into a folder")
# ---------------- inspect ----------------
ins = sub.add_parser("inspect", help="Inspect a single resume file (doc_type/sections)")
ins.add_argument("--file", required=True, help="Path to resume file")
args = ap.parse_args()
# ========================= import_exports =========================
if args.cmd == "import_exports":
con = connect(args.db)
try:
init_db(con)
log = Logger(args.log)
prev_enabled = os.environ.get("LLM_PARSE_ENABLED")
prev_force = os.environ.get("LLM_PARSE_FORCE")
prev_review_mode = os.environ.get("LLM_PARSE_REVIEW_MODE")
prev_review_rounds = os.environ.get("LLM_PARSE_REVIEW_ROUNDS")
try:
if args.llm == "off":
os.environ["LLM_PARSE_ENABLED"] = "0"
os.environ["LLM_PARSE_REVIEW_MODE"] = "off"
elif args.llm == "force":
os.environ["LLM_PARSE_ENABLED"] = "1"
os.environ["LLM_PARSE_FORCE"] = "1"
os.environ["LLM_PARSE_REVIEW_MODE"] = "always"
else:
os.environ["LLM_PARSE_REVIEW_MODE"] = args.llm_review
rounds = max(1, min(int(args.llm_review_rounds), 3))
os.environ["LLM_PARSE_REVIEW_ROUNDS"] = str(rounds)
stats = run_import(
con=con,
input_dir=args.input,
log=log,
max_near_dist=args.near_dist,
min_text_len=args.min_text_len,
)
finally:
if args.llm == "off":
if prev_enabled is None:
os.environ.pop("LLM_PARSE_ENABLED", None)
else:
os.environ["LLM_PARSE_ENABLED"] = prev_enabled
elif args.llm == "force":
if prev_enabled is None:
os.environ.pop("LLM_PARSE_ENABLED", None)
else:
os.environ["LLM_PARSE_ENABLED"] = prev_enabled
if prev_force is None:
os.environ.pop("LLM_PARSE_FORCE", None)
else:
os.environ["LLM_PARSE_FORCE"] = prev_force
if prev_review_mode is None:
os.environ.pop("LLM_PARSE_REVIEW_MODE", None)
else:
os.environ["LLM_PARSE_REVIEW_MODE"] = prev_review_mode
if prev_review_rounds is None:
os.environ.pop("LLM_PARSE_REVIEW_ROUNDS", None)
else:
os.environ["LLM_PARSE_REVIEW_ROUNDS"] = prev_review_rounds
finally:
con.close()
_print_json(stats)
return
# ============================= search =============================
if args.cmd == "search":
con = connect(args.db)
init_db(con)  # important: guarantees that resumes_fts and its triggers exist
try:
items: list[Dict[str, Any]] = []
out: Dict[str, Any] = {}
if args.ai:
from tg_resume_db.agent import agent_search
res = agent_search(
con,
user_prompt=args.query,
max_iters=args.ai_iters,
)
items = res.get("items", [])
out = {
"ai": True,
"llm_used": res.get("llm_used", False),
"plan": res.get("plan"),
"history": res.get("history"),
"postfilter": res.get("postfilter"),
"items": items,
"count": res.get("count", len(items)),
}
else:
filters = {
"remote": (args.remote == "true") if args.remote is not None else None,
"location": args.location,
"experience_min": args.experience_min,
"salary_min": args.salary_min,
"salary_max": args.salary_max,
"english": args.english,
"doc_type": args.doc_type,
# backward compat
"role": args.role,
"skill": args.skill,
# new
"roles_any": args.roles_any,
"skills_any": args.skills_any,
"skills_all": args.skills_all,
}
items = run_search(
con,
query=args.query,
filters=filters,
limit=args.limit,
offset=args.offset,
)
out = {"ai": False, "items": items, "count": len(items)}
# 1) print the results
_print_json(out)
# 2) bundle prompt/flag
if args.bundle == "yes":
do_bundle = True
elif args.bundle == "no":
do_bundle = False
else: # ask
do_bundle = False
if _is_interactive():
ans = input("\nСобрать найденные резюме в папку? (Y/N): ").strip().lower()
do_bundle = ans in ("y", "yes", "да", "д")
if do_bundle:
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
out_dir = f"./bundle_{ts}"
info = bundle_search_results(con, items, out_dir, copy_files=True, merge_text=True)
print(f"\n[done] Готово: {info['out_dir']}")
print(f" files copied: {info['copied_files']}, missing: {info['missing_files']}")
print(f" merged: {info['merged_text']}")
print(f" manifest: {info['manifest']}")
return
finally:
con.close()
# ============================= inspect =============================
if args.cmd == "inspect":
fp = args.file
path = Path(fp)
extract_meta = {}
if path.suffix.lower() == ".pdf":
pdf_res = extract_pdf_best(path, timeout_sec=25)
raw_text = pdf_res.text
extract_meta = {
"method": pdf_res.method,
"quality_score": pdf_res.score,
"quality_flags": pdf_res.flags,
"pages": len(pdf_res.pages),
}
else:
raw_text = extract_text_generic(path)
extract_meta = {"method": "generic"}
clean = normalize_text(raw_text or "")
dt = detect_doc_type(clean, file_ext=Path(fp).suffix.lower())
secs = split_sections(clean, dt.doc_type)
out = {
"file": fp,
"doc_type": dt.doc_type,
"confidence": dt.confidence,
"signals": dt.signals,
"extract": extract_meta,
"sections_present": sections_present(secs),
"name_guess": extract_name_guess(clean),
}
_print_json(out)
return
if __name__ == "__main__":
main()
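
To exercise the CLI without a shell, one option is to set sys.argv and call main() directly; a sketch (not part of the commit), assuming ./candidates.db has already been built by import_exports (an empty DB simply yields zero results).

import sys
from tg_resume_db.cli import main

# Equivalent to: tg_resume_db search --db ./candidates.db --query "python senior" --limit 5 --bundle no
sys.argv = [
    "tg_resume_db", "search",
    "--db", "./candidates.db",
    "--query", "python senior",
    "--limit", "5",
    "--bundle", "no",
]
main()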

296
db.py Normal file

@@ -0,0 +1,296 @@
from __future__ import annotations
import sqlite3
from pathlib import Path
SCHEMA = r"""
PRAGMA journal_mode=WAL;
PRAGMA synchronous=NORMAL;
PRAGMA temp_store=MEMORY;
CREATE TABLE IF NOT EXISTS candidates (
candidate_id TEXT PRIMARY KEY,
name TEXT,
location TEXT,
remote INTEGER,
experience_years REAL,
experience_years_eng REAL, -- engineering experience (after the HR filter)
experience_confidence REAL,
salary_min INTEGER,
salary_max INTEGER,
salary_confidence REAL,
english_level TEXT,
roles_json TEXT,
skills_json TEXT,
primary_languages_json TEXT,
backend_focus INTEGER,
roles_norm TEXT, -- "|backend|devops|"
skills_norm TEXT, -- "|python|k8s|"
created_at TEXT DEFAULT (datetime('now')),
updated_at TEXT DEFAULT (datetime('now'))
);
CREATE TABLE IF NOT EXISTS candidate_contacts (
contact_type TEXT NOT NULL, -- email/phone/tg/github/linkedin/url
contact_value TEXT NOT NULL, -- normalized
candidate_id TEXT NOT NULL,
created_at TEXT DEFAULT (datetime('now')),
PRIMARY KEY(contact_type, contact_value),
FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);
CREATE INDEX IF NOT EXISTS idx_contacts_candidate ON candidate_contacts(candidate_id);
CREATE TABLE IF NOT EXISTS resumes (
resume_id TEXT PRIMARY KEY,
candidate_id TEXT NOT NULL,
sha256 TEXT,
simhash TEXT,
clean_text TEXT NOT NULL,
raw_text TEXT,
extraction_json TEXT,
llm_summary TEXT,
llm_tags_json TEXT,
extract_method TEXT,
extract_quality_score REAL,
extract_quality_flags TEXT,
extract_pages_json TEXT,
doc_type TEXT,
doc_type_confidence REAL,
parse_method TEXT,
parse_version TEXT,
sections_json TEXT,
is_active INTEGER DEFAULT 1,
duplicate_of_resume_id TEXT,
file_path TEXT,
file_mtime INTEGER,
file_size INTEGER,
created_at TEXT DEFAULT (datetime('now')),
FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);
CREATE UNIQUE INDEX IF NOT EXISTS idx_resumes_sha ON resumes(sha256) WHERE sha256 IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_resumes_candidate ON resumes(candidate_id);
CREATE INDEX IF NOT EXISTS idx_resumes_active ON resumes(is_active);
CREATE TABLE IF NOT EXISTS sources (
source_id INTEGER PRIMARY KEY AUTOINCREMENT,
resume_id TEXT NOT NULL,
export_path TEXT,
chat_title TEXT,
message_id TEXT,
message_date TEXT,
origin_type TEXT,
original_file_path TEXT,
original_file_name TEXT,
extra_json TEXT,
created_at TEXT DEFAULT (datetime('now')),
FOREIGN KEY(resume_id) REFERENCES resumes(resume_id)
);
CREATE TABLE IF NOT EXISTS files_seen (
sha256 TEXT PRIMARY KEY,
size INTEGER,
mtime INTEGER,
canonical_resume_id TEXT,
first_seen_at TEXT DEFAULT (datetime('now')),
last_seen_at TEXT DEFAULT (datetime('now'))
);
CREATE TABLE IF NOT EXISTS simhash_buckets (
bucket INTEGER NOT NULL,
band INTEGER NOT NULL,
resume_id TEXT NOT NULL,
PRIMARY KEY(bucket, band, resume_id),
FOREIGN KEY(resume_id) REFERENCES resumes(resume_id)
);
CREATE TABLE IF NOT EXISTS candidate_skills (
candidate_id TEXT NOT NULL,
skill_id TEXT NOT NULL,
skill_label TEXT,
confidence REAL,
source TEXT,
evidence TEXT,
created_at TEXT DEFAULT (datetime('now')),
PRIMARY KEY(candidate_id, skill_id),
FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);
CREATE TABLE IF NOT EXISTS candidate_roles (
candidate_id TEXT NOT NULL,
role TEXT NOT NULL,
confidence REAL,
source TEXT,
evidence TEXT,
created_at TEXT DEFAULT (datetime('now')),
PRIMARY KEY(candidate_id, role),
FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);
CREATE TABLE IF NOT EXISTS candidate_languages (
candidate_id TEXT NOT NULL,
language TEXT NOT NULL,
level TEXT,
confidence REAL,
source TEXT,
evidence TEXT,
created_at TEXT DEFAULT (datetime('now')),
PRIMARY KEY(candidate_id, language),
FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);
CREATE TABLE IF NOT EXISTS positions (
position_id TEXT PRIMARY KEY,
resume_id TEXT NOT NULL,
candidate_id TEXT NOT NULL,
title TEXT,
company TEXT,
date_from TEXT,
date_to TEXT,
is_current INTEGER,
description TEXT,
stack_json TEXT,
created_at TEXT DEFAULT (datetime('now')),
FOREIGN KEY(resume_id) REFERENCES resumes(resume_id),
FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);
CREATE TABLE IF NOT EXISTS llm_cache (
cache_key TEXT PRIMARY KEY,
model TEXT,
result_json TEXT,
created_at TEXT DEFAULT (datetime('now'))
);
-- Full-text index (FTS5), kept in sync with resumes via the triggers below
CREATE VIRTUAL TABLE IF NOT EXISTS resumes_fts USING fts5(
resume_id UNINDEXED,
candidate_id UNINDEXED,
clean_text,
tokenize='unicode61 remove_diacritics 2'
);
-- --- Triggers to keep FTS synced with resumes ---
-- Insert
CREATE TRIGGER IF NOT EXISTS trg_resumes_ai_fts
AFTER INSERT ON resumes
BEGIN
DELETE FROM resumes_fts WHERE resume_id = NEW.resume_id;
INSERT INTO resumes_fts(resume_id, candidate_id, clean_text)
SELECT NEW.resume_id, NEW.candidate_id, NEW.clean_text
WHERE NEW.is_active = 1;
END;
-- Delete
CREATE TRIGGER IF NOT EXISTS trg_resumes_ad_fts
AFTER DELETE ON resumes
BEGIN
DELETE FROM resumes_fts WHERE resume_id = OLD.resume_id;
END;
-- Update (text/active/candidate)
CREATE TRIGGER IF NOT EXISTS trg_resumes_au_fts
AFTER UPDATE OF clean_text, is_active, candidate_id ON resumes
BEGIN
DELETE FROM resumes_fts WHERE resume_id = NEW.resume_id;
INSERT INTO resumes_fts(resume_id, candidate_id, clean_text)
SELECT NEW.resume_id, NEW.candidate_id, NEW.clean_text
WHERE NEW.is_active = 1;
END;
"""
def connect(db_path: str) -> sqlite3.Connection:
Path(db_path).parent.mkdir(parents=True, exist_ok=True)
con = sqlite3.connect(db_path)
con.row_factory = sqlite3.Row
return con
def _table_exists(con: sqlite3.Connection, name: str) -> bool:
row = con.execute(
"SELECT 1 FROM sqlite_master WHERE type IN ('table','view') AND name=? LIMIT 1",
(name,),
).fetchone()
return row is not None
def _column_exists(con: sqlite3.Connection, table: str, column: str) -> bool:
cur = con.execute(f"PRAGMA table_info({table})")
for r in cur.fetchall():
if r["name"] == column:
return True
return False
def _add_column_if_missing(con: sqlite3.Connection, table: str, column: str, ddl_type: str) -> None:
if not _table_exists(con, table):
return
if _column_exists(con, table, column):
return
con.execute(f"ALTER TABLE {table} ADD COLUMN {column} {ddl_type}")
def _ensure_fts_backfilled(con: sqlite3.Connection) -> None:
"""
Если resumes_fts пустая / рассинхронизирована - пересобираем из resumes.
Это лечит ситуацию: init_db создал FTS, но данные туда никто не залил => search всегда 0.
"""
if not _table_exists(con, "resumes") or not _table_exists(con, "resumes_fts"):
return
try:
resumes_cnt = int(con.execute("SELECT COUNT(*) AS c FROM resumes WHERE is_active=1").fetchone()["c"])
fts_cnt = int(con.execute("SELECT COUNT(*) AS c FROM resumes_fts").fetchone()["c"])
except Exception:
return
if resumes_cnt <= 0:
return
# Any mismatch -> rebuild (fixes both an empty index and duplicate rows)
if fts_cnt != resumes_cnt:
con.execute("DELETE FROM resumes_fts")
con.execute(
"""
INSERT INTO resumes_fts(resume_id, candidate_id, clean_text)
SELECT resume_id, candidate_id, clean_text
FROM resumes
WHERE is_active=1
"""
)
con.commit()
def init_db(con: sqlite3.Connection) -> None:
con.executescript(SCHEMA)
# Lightweight migrations for existing DBs (safe to re-run)
_add_column_if_missing(con, "candidates", "experience_years_eng", "REAL")
_add_column_if_missing(con, "candidates", "primary_languages_json", "TEXT")
_add_column_if_missing(con, "candidates", "backend_focus", "INTEGER")
_add_column_if_missing(con, "resumes", "llm_summary", "TEXT")
_add_column_if_missing(con, "resumes", "llm_tags_json", "TEXT")
_add_column_if_missing(con, "resumes", "extract_method", "TEXT")
_add_column_if_missing(con, "resumes", "extract_quality_score", "REAL")
_add_column_if_missing(con, "resumes", "extract_quality_flags", "TEXT")
_add_column_if_missing(con, "resumes", "extract_pages_json", "TEXT")
_add_column_if_missing(con, "resumes", "doc_type", "TEXT")
_add_column_if_missing(con, "resumes", "doc_type_confidence", "REAL")
_add_column_if_missing(con, "resumes", "parse_method", "TEXT")
_add_column_if_missing(con, "resumes", "parse_version", "TEXT")
_add_column_if_missing(con, "resumes", "sections_json", "TEXT")
if not _table_exists(con, "llm_cache"):
con.execute(
"""
CREATE TABLE IF NOT EXISTS llm_cache (
cache_key TEXT PRIMARY KEY,
model TEXT,
result_json TEXT,
created_at TEXT DEFAULT (datetime('now'))
)
"""
)
con.commit()
_ensure_fts_backfilled(con)
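
A quick local check of the schema and FTS triggers above (a sketch, not part of the commit); the demo path and sample values are placeholders.

from tg_resume_db.db import connect, init_db

con = connect("./demo.db")
init_db(con)
con.execute("INSERT INTO candidates(candidate_id, name) VALUES (?, ?)", ("cand-1", "Test Candidate"))
con.execute(
    "INSERT INTO resumes(resume_id, candidate_id, clean_text) VALUES (?, ?, ?)",
    ("res-1", "cand-1", "Senior Python backend engineer, FastAPI, PostgreSQL"),
)
con.commit()
# The AFTER INSERT trigger should have indexed the row (is_active defaults to 1).
rows = con.execute("SELECT resume_id FROM resumes_fts WHERE resumes_fts MATCH ?", ("python",)).fetchall()
print([r["resume_id"] for r in rows])  # expected: ['res-1']
con.close()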

41
dedup/simhash.py Normal file

@@ -0,0 +1,41 @@
from __future__ import annotations
import hashlib
import re
from typing import List, Tuple
def sha256_file_bytes_iter(f, chunk_size: int = 1024 * 1024) -> str:
h = hashlib.sha256()
for chunk in iter(lambda: f.read(chunk_size), b""):
h.update(chunk)
return h.hexdigest()
def sha256_file(path) -> str:
with open(path, "rb") as f:
return sha256_file_bytes_iter(f)
def sha1_str(s: str) -> str:
return hashlib.sha1(s.encode("utf-8", errors="ignore")).hexdigest()
def simhash64(text: str) -> int:
tokens = re.findall(r"[a-zA-Z0-9_#+./-]{2,}", text.lower())
if not tokens:
return 0
v = [0] * 64
for tok in tokens:
h = hashlib.md5(tok.encode("utf-8")).digest()
x = int.from_bytes(h[:8], "big", signed=False)
for i in range(64):
v[i] += 1 if ((x >> i) & 1) else -1
out = 0
for i in range(64):
if v[i] > 0:
out |= (1 << i)
return out
def hamming64(a: int, b: int) -> int:
return (a ^ b).bit_count()
def simhash_bands(x: int) -> List[Tuple[int, int]]:
# 4 bands x 16 bits
return [(((x >> (band * 16)) & 0xFFFF), band) for band in range(4)]

39
extract/clean.py Normal file

@@ -0,0 +1,39 @@
from __future__ import annotations
import re
from collections import Counter
import unicodedata
RE_PAGE = re.compile(r"^\s*(page|стр(аница)?)\s*\d+\s*(/|\s+of\s+)\s*\d+\s*$", re.I)
RE_ONLY_PUNCT = re.compile(r"^[\W_]+$", re.U)
RE_MULTI_SPACE = re.compile(r"[ \t]+")
RE_MULTI_NL = re.compile(r"\n{3,}")
_INVISIBLE_CHARS = ["\u00ad", "\u200b", "\u200c", "\u200d", "\ufeff"]
_BIDI_CTRL_RE = re.compile(r"[\u202a-\u202e\u2060\u2066-\u2069\ufffe\uffff]")
def normalize_text(raw: str) -> str:
text = raw.replace("\r\n", "\n").replace("\r", "\n")
for ch in _INVISIBLE_CHARS:
text = text.replace(ch, "")
text = _BIDI_CTRL_RE.sub("", text)
# remove most control/format chars but keep line breaks and tabs
text = "".join(
ch for ch in text
if ch in ("\n", "\t") or not unicodedata.category(ch).startswith("C")
)
text = "".join(ch for ch in text if ch == "\n" or (ch.isprintable() and ch != "\x0b"))
lines = [RE_MULTI_SPACE.sub(" ", ln).strip() for ln in text.split("\n")]
lines = [ln for ln in lines if ln and not RE_PAGE.match(ln) and not RE_ONLY_PUNCT.match(ln)]
counts = Counter(lines)
filtered = []
for ln in lines:
if counts[ln] >= 4 and len(ln) <= 90:
continue
filtered.append(ln)
text = "\n".join(filtered)
text = RE_MULTI_NL.sub("\n\n", text).strip()
return text
def to_fts_text(clean: str) -> str:
return re.sub(r"\s+", " ", clean).strip()
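
A sketch of what normalize_text does to noisy extractor output (not part of the commit); the sample string is made up.

from tg_resume_db.extract.clean import normalize_text, to_fts_text

raw = "Resume\u00ad of John\r\nPage 1 of 3\r\n\r\n\r\n\r\nSkills:   Python,  Go"
clean = normalize_text(raw)
print(clean)               # soft hyphen removed, "Page 1 of 3" footer dropped, spacing collapsed
print(to_fts_text(clean))  # single-line form for FTS indexing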

134
extract/doc_type.py Normal file

@@ -0,0 +1,134 @@
from __future__ import annotations
import re
from dataclasses import dataclass
from typing import List, Optional
@dataclass
class DocTypeResult:
doc_type: str
confidence: float
signals: List[str]
_HH_PATTERNS = [
(re.compile(r"\bhh\.ru\b", re.I), 2.0, "hh_domain"),
(re.compile(r"\bheadhunter\b", re.I), 2.0, "headhunter"),
(re.compile(r"\bрезюме\s+обновлено\b", re.I), 2.5, "hh_resume_updated"),
(re.compile(r"\елаемая\s+должность\b", re.I), 1.2, "hh_desired_role"),
(re.compile(r"\bключевые\s+навыки\b", re.I), 1.0, "hh_key_skills"),
(re.compile(r"\bопыт\s+работы\b", re.I), 0.8, "hh_experience"),
]
_LI_PATTERNS = [
(re.compile(r"\blinkedin\b", re.I), 2.5, "li_brand"),
(re.compile(r"\blinkedin\.com\b", re.I), 2.0, "li_domain"),
(re.compile(r"\bexperience\b", re.I), 0.9, "li_experience"),
(re.compile(r"\beducation\b", re.I), 0.9, "li_education"),
(re.compile(r"\bskills\b", re.I), 0.9, "li_skills"),
(re.compile(r"\babout\b", re.I), 0.6, "li_about"),
]
_PPTX_PATTERNS = [
(re.compile(r"\bslide\b", re.I), 1.0, "pptx_slide"),
(re.compile(r"\bpowerpoint\b", re.I), 1.3, "pptx_powerpoint"),
(re.compile(r"\bpptx\b", re.I), 1.3, "pptx_ext"),
(re.compile(r"\bpresentation\b", re.I), 0.8, "pptx_presentation"),
]
def _score_patterns(text: str, patterns) -> tuple[float, List[str]]:
score = 0.0
signals: List[str] = []
for rx, weight, name in patterns:
if rx.search(text):
score += weight
signals.append(name)
return score, signals
def _confidence_from_score(score: float) -> float:
if score >= 4.0:
return 0.92
if score >= 3.0:
return 0.85
if score >= 2.0:
return 0.75
if score >= 1.2:
return 0.62
if score > 0.0:
return 0.50
return 0.30
def detect_doc_type(clean_text: str, file_ext: Optional[str] = None) -> DocTypeResult:
lines = [ln.strip() for ln in (clean_text or "").splitlines() if ln.strip()]
head_lines = lines[:80]
head_text = "\n".join(head_lines)
head_lc = head_text.lower()
signals: List[str] = []
hh_score, hh_signals = _score_patterns(head_text, _HH_PATTERNS)
li_score, li_signals = _score_patterns(head_text, _LI_PATTERNS)
pptx_score, pptx_signals = _score_patterns(head_text, _PPTX_PATTERNS)
if file_ext and file_ext.lower() in (".pptx", ".ppt"):
pptx_score += 2.0
signals.append("pptx_ext")
signals.extend(hh_signals + li_signals + pptx_signals)
# One-page heuristic: short resumes with dense content
total_len = len(clean_text or "")
one_page_score = 0.0
if len(lines) <= 70 and total_len <= 4500:
one_page_score = 2.2
signals.append("one_page_short")
elif len(lines) <= 90 and total_len <= 6500:
one_page_score = 1.6
signals.append("one_page_medium")
# Scan heuristic: very low textual content
letters = sum(ch.isalpha() for ch in clean_text or "")
total = max(1, len(clean_text or ""))
letter_ratio = letters / total
scan_score = 0.0
if total_len < 200 or letter_ratio < 0.12:
scan_score = 3.2
signals.append("scan_low_text")
if file_ext and file_ext.lower() in (".pdf", ".png", ".jpg", ".jpeg", ".tiff"):
scan_score += 0.6
signals.append("scan_file_ext")
candidates = [
("hh_ru", hh_score),
("linkedin_pdf", li_score),
("pptx_export", pptx_score),
("one_page", one_page_score),
("scan_pdf", scan_score),
]
doc_type, best_score = max(candidates, key=lambda x: x[1])
if best_score <= 0.0:
base = "generic_pdf" if (file_ext and file_ext.lower() == ".pdf") else "generic"
return DocTypeResult(doc_type=base, confidence=0.35, signals=signals)
confidence = _confidence_from_score(best_score)
# If scan is detected strongly, prefer it
if doc_type == "scan_pdf" and confidence >= 0.8:
return DocTypeResult(doc_type="scan_pdf", confidence=confidence, signals=signals)
# Split one-page into ru/en
if doc_type == "one_page":
if _looks_cyrillic(head_text):
return DocTypeResult(doc_type="one_page_ru", confidence=confidence, signals=signals)
return DocTypeResult(doc_type="one_page_en", confidence=confidence, signals=signals)
return DocTypeResult(doc_type=doc_type, confidence=confidence, signals=signals)
def _looks_cyrillic(text: str) -> bool:
cyr = len(re.findall(r"[А-Яа-яЁё]", text))
lat = len(re.findall(r"[A-Za-z]", text))
return cyr > lat and cyr >= 10
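
A sketch of detect_doc_type on a tiny made-up snippet (not part of the commit). With this little text the scan_low_text heuristic tends to dominate, so very short inputs usually come back as scan_pdf.

from tg_resume_db.extract.doc_type import detect_doc_type

text = "John Doe\nExperience\nBackend Engineer at Acme\nEducation\nSkills\nPython, Go"
res = detect_doc_type(text, file_ext=".pdf")
print(res.doc_type, round(res.confidence, 2), res.signals)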

159
extract/experience.py Normal file

@@ -0,0 +1,159 @@
from __future__ import annotations
import re
from dataclasses import dataclass
from datetime import date
from typing import Dict, List, Optional, Tuple
# Month maps (EN + RU)
MONTHS = {
"jan": 1, "january": 1, "янв": 1, "январ": 1,
"feb": 2, "february": 2, "фев": 2, "феврал": 2,
"mar": 3, "march": 3, "мар": 3, "март": 3,
"apr": 4, "april": 4, "апр": 4, "апрел": 4,
"may": 5, "май": 5,
"jun": 6, "june": 6, "июн": 6, "июнь": 6,
"jul": 7, "july": 7, "июл": 7, "июль": 7,
"aug": 8, "august": 8, "авг": 8, "август": 8,
"sep": 9, "september": 9, "сен": 9, "сент": 9,
"oct": 10, "october": 10, "окт": 10, "октя": 10,
"nov": 11, "november": 11, "ноя": 11, "ноябр": 11,
"dec": 12, "december": 12, "дек": 12, "дека": 12,
}
PRESENT_RE = re.compile(r"\b(present|now|current|настоящее время|по н\\.|по настоящее)\b", re.I)
# Direct "X years" patterns
DIRECT_YEARS_RE = re.compile(r"(\d+(?:[.,]\d+)?)\s*(?:\+?\s*)?(?:years?|yrs?|лет|года|год)\b", re.I)
# Dates like 03.2019, 2019, Jan 2020, янв 2020
MMYYYY_RE = re.compile(r"\b(0?[1-9]|1[0-2])[./-](\d{4})\b")
YYYY_RE = re.compile(r"\b(19\d{2}|20\d{2})\b")
MON_YYYY_RE = re.compile(r"\b([A-Za-z]{3,9}|[А-Яа-я]{3,9})\.?\s*(\d{4})\b")
# Range separators
RANGE_RE = re.compile(r"(?P<a>.+?)\s*(?:—||-|to|по)\s*(?P<b>.+?)$", re.I)
@dataclass
class ExpResult:
years: Optional[float]
confidence: float
debug: Dict
def _clamp_years(y: float) -> Optional[float]:
if 0.0 <= y <= 45.0:
return y
return None
def _parse_mon(mon: str) -> Optional[int]:
m = mon.strip().lower()
m = re.sub(r"[^\wа-я]+", "", m, flags=re.I)
# allow prefixes: "январ", "феврал"
for k, v in MONTHS.items():
if m.startswith(k):
return v
return None
def _as_ymd(y: int, m: int) -> date:
return date(y, m, 1)
def _parse_one_date(s: str) -> Optional[date]:
s = s.strip()
if PRESENT_RE.search(s):
today = date.today()
return date(today.year, today.month, 1)
m1 = MMYYYY_RE.search(s)
if m1:
mm = int(m1.group(1))
yy = int(m1.group(2))
return _as_ymd(yy, mm)
m2 = MON_YYYY_RE.search(s)
if m2:
mon = _parse_mon(m2.group(1))
yy = int(m2.group(2))
if mon:
return _as_ymd(yy, mon)
m3 = YYYY_RE.search(s)
if m3:
yy = int(m3.group(1))
return _as_ymd(yy, 1)
return None
def _merge_intervals(intervals: List[Tuple[date, date]]) -> List[Tuple[date, date]]:
if not intervals:
return []
intervals = sorted(intervals, key=lambda x: (x[0], x[1]))
merged = [intervals[0]]
for s, e in intervals[1:]:
ls, le = merged[-1]
if s <= le:
merged[-1] = (ls, max(le, e))
else:
merged.append((s, e))
return merged
def _months_between(a: date, b: date) -> int:
# month-level difference (inclusive-ish): b >= a
return (b.year - a.year) * 12 + (b.month - a.month)
def extract_experience(text: str) -> ExpResult:
debug: Dict = {"direct_matches": [], "ranges": [], "intervals": []}
# 1) Direct years
directs = []
for m in DIRECT_YEARS_RE.finditer(text):
try:
v = float(m.group(1).replace(",", "."))
if 0 <= v <= 45:
directs.append(v)
debug["direct_matches"].append({"match": m.group(0), "value": v})
except Exception:
pass
if directs:
years = _clamp_years(max(directs))
return ExpResult(years=years, confidence=0.90, debug=debug)
# 2) Ranges in lines: try to detect "start - end"
intervals: List[Tuple[date, date]] = []
for line in text.splitlines():
ln = line.strip()
if len(ln) < 7:
continue
# require range separator
if not any(x in ln for x in ("—", "–", "-", " to ", " по ")):
continue
rr = RANGE_RE.match(ln)
if not rr:
continue
a = rr.group("a")
b = rr.group("b")
da = _parse_one_date(a)
db = _parse_one_date(b)
if da and db:
if db < da:
da, db = db, da
# cap extremely old
if da.year < 1990:
continue
intervals.append((da, db))
debug["ranges"].append({"line": ln, "start": da.isoformat(), "end": db.isoformat()})
intervals = _merge_intervals(intervals)
debug["intervals"] = [{"start": s.isoformat(), "end": e.isoformat()} for s, e in intervals]
if not intervals:
return ExpResult(years=None, confidence=0.0, debug=debug)
total_months = 0
for s, e in intervals:
total_months += max(0, _months_between(s, e))
years = round(total_months / 12.0, 2)
years = _clamp_years(years) if years is not None else None
# confidence depends on amount of evidence
conf = 0.70 if total_months >= 12 else 0.55
return ExpResult(years=years, confidence=conf, debug=debug)
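
Two sketches for extract_experience (not part of the commit): a direct "N years" statement, which short-circuits with high confidence, and explicit date ranges, which are merged and summed.

from tg_resume_db.extract.experience import extract_experience

direct = extract_experience("Backend developer, 7+ years of experience with Python and Go")
print(direct.years, direct.confidence)   # 7.0 0.9

ranged = extract_experience("Acme Corp\n03.2019 - 11.2021\nBeta LLC\n01.2022 - present")
print(ranged.years, ranged.confidence)   # total from merged, non-overlapping intervals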


@@ -0,0 +1,144 @@
from __future__ import annotations
import re
from dataclasses import dataclass, asdict
from datetime import date
from typing import List, Optional
MONTHS = {
"jan": 1, "january": 1, "янв": 1, "январ": 1,
"feb": 2, "february": 2, "фев": 2, "феврал": 2,
"mar": 3, "march": 3, "мар": 3, "март": 3,
"apr": 4, "april": 4, "апр": 4, "апрел": 4,
"may": 5, "май": 5,
"jun": 6, "june": 6, "июн": 6, "июнь": 6,
"jul": 7, "july": 7, "июл": 7, "июль": 7,
"aug": 8, "august": 8, "авг": 8, "август": 8,
"sep": 9, "september": 9, "сен": 9, "сент": 9,
"oct": 10, "october": 10, "окт": 10, "октя": 10,
"nov": 11, "november": 11, "ноя": 11, "ноябр": 11,
"dec": 12, "december": 12, "дек": 12, "дека": 12,
}
PRESENT_RE = re.compile(r"\b(present|now|current|настоящее время|по н\.|по настоящее)\b", re.I)
MMYYYY_RE = re.compile(r"\b(0?[1-9]|1[0-2])[./-](\d{4})\b")
YYYY_RE = re.compile(r"\b(19\d{2}|20\d{2})\b")
MON_YYYY_RE = re.compile(r"\b([A-Za-z]{3,9}|[А-Яа-я]{3,9})\.?\s*(\d{4})\b")
RANGE_RE = re.compile(r"(?P<a>.+?)\s*(?:—|–|-|to|по)\s*(?P<b>.+?)$", re.I)
YEAR_RANGE_ONLY_RE = re.compile(r"^\s*\d{4}\s*(?:—|–|-|to|по)\s*\d{4}\s*$", re.I)
EDU_CONTEXT_RE = re.compile(
r"\b("
r"education|university|institute|college|academy|school|bachelor|master|degree|faculty|"
r"образование|университет|институт|академ|колледж|школа|бакалав|магистр|факультет"
r")\b",
re.I,
)
@dataclass
class Position:
title: Optional[str]
company: Optional[str]
date_from: Optional[str]
date_to: Optional[str]
is_current: Optional[bool]
description: Optional[str]
def _parse_mon(mon: str) -> Optional[int]:
m = mon.strip().lower()
m = re.sub(r"[^\wа-я]+", "", m, flags=re.I)
for k, v in MONTHS.items():
if m.startswith(k):
return v
return None
def _as_ymd(y: int, m: int) -> date:
return date(y, m, 1)
def _parse_one_date(s: str) -> Optional[date]:
s = s.strip()
if PRESENT_RE.search(s):
today = date.today()
return date(today.year, today.month, 1)
m1 = MMYYYY_RE.search(s)
if m1:
mm = int(m1.group(1))
yy = int(m1.group(2))
return _as_ymd(yy, mm)
m2 = MON_YYYY_RE.search(s)
if m2:
mon = _parse_mon(m2.group(1))
yy = int(m2.group(2))
if mon:
return _as_ymd(yy, mon)
m3 = YYYY_RE.search(s)
if m3:
yy = int(m3.group(1))
return _as_ymd(yy, 1)
return None
def extract_positions(text: str, max_items: int = 40) -> List[Position]:
lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]
positions: List[Position] = []
i = 0
while i < len(lines) and len(positions) < max_items:
ln = lines[i]
if not any(x in ln for x in ("—", "–", "-", " to ", " по ")):
i += 1
continue
rr = RANGE_RE.match(ln)
if not rr:
i += 1
continue
ctx = " ".join(lines[max(0, i - 2): min(len(lines), i + 4)])
if YEAR_RANGE_ONLY_RE.match(ln) and EDU_CONTEXT_RE.search(ctx):
i += 1
continue
da = _parse_one_date(rr.group("a"))
db = _parse_one_date(rr.group("b"))
if not da or not db:
i += 1
continue
if da.year < 1990:
i += 1
continue
is_current = PRESENT_RE.search(rr.group("b")) is not None
title = None
company = None
desc_lines: List[str] = []
if i + 1 < len(lines):
if EDU_CONTEXT_RE.search(lines[i + 1]):
i += 1
continue
header = lines[i + 1]
parts = [p.strip() for p in re.split(r"[,|/]", header) if p.strip()]
if parts:
title = parts[0]
if len(parts) > 1:
company = parts[1]
j = i + 2
while j < len(lines):
if any(x in lines[j] for x in ("—", "–", "-", " to ", " по ")) and RANGE_RE.match(lines[j]):
break
desc_lines.append(lines[j])
j += 1
positions.append(
Position(
title=title,
company=company,
date_from=da.isoformat(),
date_to=db.isoformat(),
is_current=is_current,
description="\n".join(desc_lines).strip() if desc_lines else None,
)
)
i = j
return positions
def positions_to_dicts(items: List[Position]) -> List[dict]:
return [asdict(p) for p in items]
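
A sketch for the position extractor in this hunk (not part of the commit). The file header for this hunk is missing above, so the module path below is an assumption; adjust it to the actual file name.

from tg_resume_db.extract.positions import extract_positions, positions_to_dicts  # path assumed

text = """03.2021 - present
Senior Backend Engineer, Acme Corp
Built billing services in Go and Python
01.2018 - 02.2021
Backend Developer, Beta LLC"""
for p in positions_to_dicts(extract_positions(text)):
    print(p["title"], p["company"], p["date_from"], p["date_to"], p["is_current"])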

585
extract/llm.py Normal file

@@ -0,0 +1,585 @@
from __future__ import annotations
import hashlib
import json
import os
import re
import sqlite3
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
try:
import httpx # type: ignore
except Exception: # pragma: no cover
httpx = None # type: ignore
def resolve_llm_runtime() -> Dict[str, str]:
"""
Resolve OpenAI-compatible runtime config.
Supports both generic vars and Mistral aliases:
- generic: LLM_BASE_URL / LLM_MODEL / LLM_API_KEY
- mistral: MISTRAL_API_KEY / MISTRAL_MODEL / MISTRAL_BASE_URL
"""
provider = (os.environ.get("LLM_PROVIDER") or "").strip().lower()
base_url = (os.environ.get("LLM_BASE_URL") or "").strip()
model = (os.environ.get("LLM_MODEL") or "").strip()
api_key = (os.environ.get("LLM_API_KEY") or "").strip()
mistral_key = (os.environ.get("MISTRAL_API_KEY") or "").strip()
mistral_model = (os.environ.get("MISTRAL_MODEL") or "").strip()
mistral_base = (os.environ.get("MISTRAL_BASE_URL") or "https://api.mistral.ai/v1").strip()
if not api_key and mistral_key:
api_key = mistral_key
if not model and mistral_model:
model = mistral_model
if not base_url and (mistral_key or mistral_model or provider == "mistral" or os.environ.get("MISTRAL_BASE_URL")):
base_url = mistral_base
if base_url:
base_url = base_url.rstrip("/")
if not provider:
if "mistral.ai" in base_url or (model and model.lower().startswith("mistral")):
provider = "mistral"
else:
provider = "generic"
return {
"provider": provider,
"base_url": base_url,
"model": model,
"api_key": api_key,
}
# ------------- Public API -------------
def llm_parse_enabled() -> bool:
"""
Enabled only if httpx is available and both base_url/model are resolved.
Opt-out via LLM_PARSE_ENABLED=0.
"""
if httpx is None:
return False
if os.environ.get("LLM_PARSE_ENABLED", "1").lower() in ("0", "false", "no"):
return False
runtime = resolve_llm_runtime()
return bool(runtime["base_url"]) and bool(runtime["model"])
_PROMPT_VERSION = "v3_sections_doc_type"
_REVIEW_PROMPT_VERSION = "v1_review_merge"
@dataclass
class LLMExtraction:
roles: List[str]
skills: List[str]
primary_languages: List[str]
seniority: Optional[str]
backend_focus: Optional[bool]
experience_years_total: Optional[float]
experience_years_engineering: Optional[float]
english_level: Optional[str]
location: Optional[str]
remote_ok: Optional[bool]
salary_min_usd: Optional[int]
salary_max_usd: Optional[int]
salary_min_rub: Optional[int]
salary_max_rub: Optional[int]
highlights: List[str]
keywords: List[str]
@staticmethod
def from_obj(obj: Dict[str, Any]) -> "LLMExtraction":
def _as_list(v: Any) -> List[str]:
if v is None:
return []
if isinstance(v, list):
return [str(x).strip() for x in v if str(x).strip()]
s = str(v).strip()
return [s] if s else []
def _as_float(v: Any) -> Optional[float]:
try:
return float(v)
except Exception:
return None
def _as_int(v: Any) -> Optional[int]:
try:
return int(float(v))
except Exception:
return None
def _as_bool(v: Any) -> Optional[bool]:
if isinstance(v, bool):
return v
if v is None:
return None
s = str(v).strip().lower()
if s in ("true", "1", "yes", "y"):
return True
if s in ("false", "0", "no", "n"):
return False
return None
return LLMExtraction(
roles=_as_list(obj.get("roles")),
skills=_as_list(obj.get("skills")),
primary_languages=_as_list(obj.get("primary_languages")),
seniority=(str(obj.get("seniority")).strip().lower() or None) if obj.get("seniority") else None,
backend_focus=_as_bool(obj.get("backend_focus")),
experience_years_total=_as_float(obj.get("experience_years_total")),
experience_years_engineering=_as_float(obj.get("experience_years_engineering")),
english_level=(str(obj.get("english_level")).strip().upper() or None) if obj.get("english_level") else None,
location=(str(obj.get("location")).strip() or None) if obj.get("location") else None,
remote_ok=_as_bool(obj.get("remote_ok")),
salary_min_usd=_as_int(obj.get("salary_min_usd")),
salary_max_usd=_as_int(obj.get("salary_max_usd")),
salary_min_rub=_as_int(obj.get("salary_min_rub")),
salary_max_rub=_as_int(obj.get("salary_max_rub")),
highlights=_as_list(obj.get("highlights")),
keywords=_as_list(obj.get("keywords")),
)
def llm_extract_profile(
clean_text: str,
*,
con: Optional[sqlite3.Connection] = None,
doc_type: Optional[str] = None,
sections: Optional[Dict[str, str]] = None,
) -> Tuple[Optional[LLMExtraction], Dict[str, Any]]:
"""
Returns (LLMExtraction | None, debug_info).
- Uses cache on disk/sqlite to keep throughput high.
- Silently degrades to None on any failure.
"""
runtime = resolve_llm_runtime()
dbg: Dict[str, Any] = {
"enabled": llm_parse_enabled(),
"provider": runtime.get("provider"),
"model": runtime.get("model"),
"from_cache": False,
"cache_backend": None,
"error": None,
"prompt_version": _PROMPT_VERSION,
}
if not llm_parse_enabled():
return None, dbg
text_hash = hashlib.sha1(clean_text.encode("utf-8", errors="ignore")).hexdigest()
cache_key = f"extract:{text_hash}:{runtime['model']}:{_PROMPT_VERSION}"
payload = _build_payload(
clean_text,
doc_type=doc_type,
sections=sections,
prompt_version=_PROMPT_VERSION,
temperature=float(os.environ.get("LLM_PARSE_TEMPERATURE", 0.1)),
max_tokens=int(os.environ.get("LLM_PARSE_MAX_TOKENS", 700)),
system_prompt="You output ONLY JSON for structured resume extraction.",
prompt_template=_PROMPT_TEMPLATE,
)
data = _cached_llm_json_call(
con=con,
cache_key=cache_key,
model=runtime["model"],
payload=payload,
dbg=dbg,
)
if data is None:
return None, dbg
return LLMExtraction.from_obj(data), dbg
def llm_review_profile(
clean_text: str,
*,
draft: Dict[str, Any],
con: Optional[sqlite3.Connection] = None,
doc_type: Optional[str] = None,
sections: Optional[Dict[str, str]] = None,
) -> Tuple[Optional[LLMExtraction], Dict[str, Any]]:
"""
Second-pass validator:
- Takes already parsed JSON (draft)
- Re-checks every field against resume text
- Returns corrected extraction for safe merge in pipeline
"""
runtime = resolve_llm_runtime()
dbg: Dict[str, Any] = {
"enabled": llm_parse_enabled(),
"provider": runtime.get("provider"),
"model": runtime.get("model"),
"from_cache": False,
"cache_backend": None,
"error": None,
"prompt_version": _REVIEW_PROMPT_VERSION,
"quality_score": None,
"changed_fields": [],
"issues_found": [],
}
if not llm_parse_enabled():
return None, dbg
clean_draft = _sanitize_review_draft(draft)
draft_blob = json.dumps(clean_draft, ensure_ascii=False, sort_keys=True)
text_hash = hashlib.sha1(clean_text.encode("utf-8", errors="ignore")).hexdigest()
draft_hash = hashlib.sha1(draft_blob.encode("utf-8", errors="ignore")).hexdigest()
cache_key = f"review:{text_hash}:{draft_hash}:{runtime['model']}:{_REVIEW_PROMPT_VERSION}"
payload = _build_payload(
clean_text,
doc_type=doc_type,
sections=sections,
prompt_version=_REVIEW_PROMPT_VERSION,
temperature=float(os.environ.get("LLM_REVIEW_TEMPERATURE", 0.0)),
max_tokens=int(os.environ.get("LLM_REVIEW_MAX_TOKENS", 850)),
system_prompt="You output ONLY JSON for resume parsing quality review.",
prompt_template=_REVIEW_PROMPT_TEMPLATE,
extra_vars={"draft_json": draft_blob},
)
data = _cached_llm_json_call(
con=con,
cache_key=cache_key,
model=runtime["model"],
payload=payload,
dbg=dbg,
)
if data is None:
return None, dbg
corrected_obj: Dict[str, Any]
if isinstance(data.get("corrected"), dict):
corrected_obj = data["corrected"]
else:
corrected_obj = data
dbg["quality_score"] = _as_float(data.get("quality_score"))
dbg["changed_fields"] = _as_str_list(data.get("changed_fields"))
dbg["issues_found"] = _as_str_list(data.get("issues_found"))
return LLMExtraction.from_obj(corrected_obj), dbg
# ------------- Internal helpers -------------
_PROMPT_TEMPLATE = """
You are an assistant that structures developer resumes. Respond with JSON ONLY.
Use only facts from the text and do not invent anything. If data is missing, use null or an empty list.
Schema:
{{
"roles": ["backend","devops","frontend","qa","data engineer","android","ios"],
"skills": ["python","go","k8s","postgres","react", "..."],
"primary_languages": ["python","go","java","c++", "..."],
"seniority": "intern|junior|middle|senior|lead|principal|null",
"backend_focus": true|false|null,
"experience_years_total": number|null,
"experience_years_engineering": number|null,
"english_level": "A1|A2|B1|B2|C1|C2|null",
"location": "city, country|null",
"remote_ok": true|false|null,
"salary_min_usd": int|null,
"salary_max_usd": int|null,
"salary_min_rub": int|null,
"salary_max_rub": int|null,
"highlights": ["кратко достижения (1-2 предложения)"],
"keywords": ["уникальные ключевые слова, продукты или домены"]
}}
Do not include contact details in skills/keywords.
Detected doc_type: {doc_type}
Sections (if present):
{sections_block}
Full text snippet (use only if needed):
```TEXT
{resume_text}
```
"""
_REVIEW_PROMPT_TEMPLATE = """
You are a quality validator for developer resume parsing. Respond with JSON ONLY.
You are given a draft JSON produced by heuristics / first-pass parsing. Re-check every field against the resume text.
Correct only what is directly supported by the text. Do not invent anything.
Return JSON of exactly this shape:
{{
"corrected": {{
"roles": ["..."],
"skills": ["..."],
"primary_languages": ["..."],
"seniority": "intern|junior|middle|senior|lead|principal|null",
"backend_focus": true|false|null,
"experience_years_total": number|null,
"experience_years_engineering": number|null,
"english_level": "A1|A2|B1|B2|C1|C2|null",
"location": "city, country|null",
"remote_ok": true|false|null,
"salary_min_usd": int|null,
"salary_max_usd": int|null,
"salary_min_rub": int|null,
"salary_max_rub": int|null,
"highlights": ["..."],
"keywords": ["..."]
}},
"changed_fields": ["field_name", "..."],
"issues_found": ["кратко что было неверно/сомнительно", "..."],
"quality_score": 0.0
}}
Draft JSON:
```DRAFT
{draft_json}
```
Detected doc_type: {doc_type}
Sections (if present):
{sections_block}
Full text snippet (use only if needed):
```TEXT
{resume_text}
```
"""
def _trim_text(text: str, max_len: int = 9000) -> str:
"""
Keep head and tail to preserve summary + recent projects.
"""
if len(text) <= max_len:
return text
head = text[: max_len // 2]
tail = text[-max_len // 2 :]
return head + "\n...\n" + tail
def _build_payload(
clean_text: str,
*,
doc_type: Optional[str],
sections: Optional[Dict[str, str]],
prompt_version: str,
temperature: float,
max_tokens: int,
system_prompt: str,
prompt_template: str,
extra_vars: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
runtime = resolve_llm_runtime()
base_url = runtime["base_url"]
model = runtime["model"]
sections_block = _build_sections_block(sections)
tpl_vars = {
"resume_text": _trim_text(clean_text),
"doc_type": (doc_type or "unknown"),
"sections_block": sections_block or "(no sections detected)",
}
if extra_vars:
tpl_vars.update(extra_vars)
prompt = prompt_template.format(**tpl_vars)
return {
"base_url": base_url,
"model": model,
"prompt_version": prompt_version,
"payload": {
"model": model,
"messages": [
{"role": "system", "content": system_prompt},
{"role": "user", "content": prompt},
],
"temperature": temperature,
"max_tokens": max_tokens,
},
"headers": _build_headers(runtime),
"timeout": float(os.environ.get("LLM_PARSE_TIMEOUT", 18.0)),
}
def _build_headers(runtime: Dict[str, str]) -> Dict[str, str]:
headers = {"Content-Type": "application/json"}
api_key = runtime.get("api_key", "")
if api_key:
headers["Authorization"] = f"Bearer {api_key}"
return headers
def _cached_llm_json_call(
*,
con: Optional[sqlite3.Connection],
cache_key: str,
model: str,
payload: Dict[str, Any],
dbg: Dict[str, Any],
) -> Optional[Dict[str, Any]]:
data = _cache_get_sqlite(con, cache_key)
if data:
dbg["from_cache"] = True
dbg["cache_backend"] = "sqlite"
return data
cache_dir = Path(os.environ.get("LLM_PARSE_CACHE", ".cache/llm_parse")).resolve()
cache_ok = True
try:
cache_dir.mkdir(parents=True, exist_ok=True)
except Exception:
cache_ok = False
safe_name = cache_key.replace(":", "_")
cache_path = (cache_dir / f"{safe_name}.json") if cache_ok else None
if cache_path and cache_path.exists():
try:
data = json.loads(cache_path.read_text(encoding="utf-8"))
dbg["from_cache"] = True
dbg["cache_backend"] = "disk"
return data
except Exception:
pass
try:
data = _llm_call_json(payload)
if con:
_cache_put_sqlite(con, cache_key, model, data)
if cache_path:
cache_path.write_text(json.dumps(data, ensure_ascii=False), encoding="utf-8")
return data
except Exception as e: # pragma: no cover - network/LLM failures
dbg["error"] = repr(e)
return None
def _llm_call_json(task: Dict[str, Any]) -> Dict[str, Any]:
if httpx is None:
raise RuntimeError("httpx is not installed")
base_url: str = task["base_url"]
payload: Dict[str, Any] = task["payload"]
timeout = float(task.get("timeout", 18.0))
with httpx.Client(timeout=timeout) as client:
r = client.post(f"{base_url}/chat/completions", headers=task["headers"], json=payload)
r.raise_for_status()
data = r.json()
content = data["choices"][0]["message"]["content"]
if isinstance(content, list):
parts = []
for block in content:
if isinstance(block, dict):
parts.append(str(block.get("text") or ""))
else:
parts.append(str(block))
content = "\n".join(parts)
content = str(content)
m = re.search(r"\{.*\}", content, flags=re.S)
if not m:
raise ValueError("LLM did not return JSON")
return json.loads(m.group(0))
def _build_sections_block(sections: Optional[Dict[str, str]]) -> str:
if not sections:
return ""
parts: List[str] = []
order = [
("about", "ABOUT"),
("skills", "SKILLS"),
("experience", "EXPERIENCE"),
("education", "EDUCATION"),
("contacts", "CONTACTS"),
]
for key, label in order:
text = sections.get(key)
if not text:
continue
snippet = _trim_text(text, max_len=1800)
parts.append(f"[{label}]\n{snippet}")
return "\n\n".join(parts)
def _sanitize_review_draft(draft: Dict[str, Any]) -> Dict[str, Any]:
if not isinstance(draft, dict):
draft = {}
allowed = {
"roles",
"skills",
"primary_languages",
"seniority",
"backend_focus",
"experience_years_total",
"experience_years_engineering",
"english_level",
"location",
"remote_ok",
"salary_min_usd",
"salary_max_usd",
"salary_min_rub",
"salary_max_rub",
"highlights",
"keywords",
}
cleaned = {k: v for k, v in draft.items() if k in allowed}
return asdict(LLMExtraction.from_obj(cleaned))
def _as_float(v: Any) -> Optional[float]:
try:
x = float(v)
except Exception:
return None
if x < 0:
return None
if x > 1.0:
return 1.0
return x
def _as_str_list(v: Any) -> List[str]:
if v is None:
return []
if isinstance(v, list):
return [str(x).strip() for x in v if str(x).strip()]
s = str(v).strip()
return [s] if s else []
def _cache_get_sqlite(con: Optional[sqlite3.Connection], cache_key: str) -> Optional[Dict[str, Any]]:
if con is None:
return None
try:
row = con.execute("SELECT result_json FROM llm_cache WHERE cache_key=?", (cache_key,)).fetchone()
if row and row["result_json"]:
return json.loads(row["result_json"])
except Exception:
return None
return None
def _cache_put_sqlite(
con: Optional[sqlite3.Connection],
cache_key: str,
model: str,
data: Dict[str, Any],
) -> None:
if con is None:
return
try:
con.execute(
"INSERT OR REPLACE INTO llm_cache(cache_key, model, result_json) VALUES (?,?,?)",
(cache_key, model, json.dumps(data, ensure_ascii=False)),
)
except Exception:
return
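
A sketch of the runtime resolution and graceful degradation described above (not part of the commit). The env values are placeholders; with a fake key the extraction call fails over the network and returns (None, dbg) by design.

import os
from tg_resume_db.extract.llm import resolve_llm_runtime, llm_parse_enabled, llm_extract_profile

os.environ.setdefault("MISTRAL_API_KEY", "sk-placeholder")        # not a real key
os.environ.setdefault("MISTRAL_MODEL", "mistral-small-latest")    # assumed model name

print(resolve_llm_runtime())   # provider/base_url/model/api_key resolved from the env aliases
print(llm_parse_enabled())     # False if httpx is missing or base_url/model are unresolved

profile, dbg = llm_extract_profile("Senior Python developer, 6 years, Berlin, Germany")
print(dbg["enabled"], dbg["from_cache"], dbg["error"])
if profile is not None:
    print(profile.roles, profile.skills, profile.location)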

659
extract/parse.py Normal file

@@ -0,0 +1,659 @@
from __future__ import annotations
import json
import re
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
from tg_resume_db.normalize import normalize_skill
from tg_resume_db.extract.experience import extract_experience
EMAIL_RE = re.compile(r"\b[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}\b", re.I)
EMAIL_SPLIT_RE = re.compile(
r"(?<![@\w])(?P<prefix>[a-z0-9][a-z0-9._%+\-]{1,40})\s+"
r"(?P<tail>[a-z0-9][a-z0-9._%+\-]{0,40}@[a-z0-9.\-]+\.[a-z]{2,})",
re.I,
)
PHONE_RE = re.compile(r"(?<!\w)(\+?\d[\d\-\s().]{7,}\d)(?!\w)")
TG_RE = re.compile(r"(?:t\.me/|@)([a-z0-9_]{4,32})", re.I)
GITHUB_RE = re.compile(r"github\.com/([A-Za-z0-9\-]+)", re.I)
LINKEDIN_RE = re.compile(r"linkedin\.com/in/([A-Za-z0-9\-_]+)", re.I)
URL_RE = re.compile(r"\bhttps?://[^\s)]+", re.I)
EN_RE = re.compile(r"\b(A1\+?|A2\+?|B1\+?|B2\+?|C1\+?|C2\+?)\b", re.I)
EN_TEXT_RE = re.compile(
r"\b(native|fluent|proficient|advanced|upper\s*intermediate|intermediate|elementary)\b",
re.I,
)
EN_LANG_RE = re.compile(r"\b(english|англий)\b", re.I)
REMOTE_RE = re.compile(
r"\b("
r"full[\s\-]?remote|remote[\s\-]?(work|position|job|only)|open to remote|remote first|"
r"удален\w*|удалён\w*|дистанцион\w*|home office|relocation not needed"
r")\b",
re.I,
)
# Salary (rough)
CURRENCY_RE = re.compile(r"(?:\b(?:руб|rub|usd|eur)\b|[₽$€])", re.I)
NUM_RE = re.compile(r"(?:(\d{2,3})\s*(k|к)\b)|(\d{2,3}\s*000)|(\d{4,7})", re.I)
SALARY_HINT_RE = re.compile(
r"\b("
r"salary|compensation|rate|expected salary|desired salary|salary expectation|income|"
r"зарплат\w*|доход|оклад|вознагражден\w*|заработ\w*|expectations"
r")\b",
re.I,
)
PAY_TOKEN_RE = re.compile(
r"([€$₽]|\b(?:usd|eur|rub|руб)\b).{0,14}\d|\d.{0,14}([€$₽]|\b(?:usd|eur|rub|руб)\b)",
re.I,
)
SALARY_NOISE_RE = re.compile(
r"\b(users?|employees?|people|domains?|cities?|objects?|stores?|requests?|transactions?|"
r"companies?|followers?|downloads?|clients?)\b",
re.I,
)
SECTION_HEADER_RE = re.compile(
r"^\s*(contacts?|contact info|about|summary|skills?|experience|work experience|education|languages?|projects?)\s*$",
re.I,
)
LOCATION_CITY_COUNTRY_RE = re.compile(
r"^[A-Za-zА-Яа-я][A-Za-zА-Яа-я' .\-]{1,40},\s*[A-Za-zА-Яа-я][A-Za-zА-Яа-я' .\-]{1,40}$"
)
# --- SKILLS & ROLES ---
SKILLS = {
"python","go","golang","java","kotlin","c#","c++","cpp","javascript","typescript","node","nodejs","react","vue","angular",
"sql","postgres","postgresql","mysql","mssql","redis","kafka","rabbitmq","docker","k8s","kubernetes","helm","terraform",
"aws","gcp","azure","linux","nginx","grpc","rest","graphql","spark","airflow","fastapi","django","flask","spring","dotnet",
"pytest","selenium","playwright","ci/cd","gitlab","github actions","prometheus","grafana"
}
_SKILL_ALIASES: Dict[str, List[str]] = {
"javascript": ["java script", "java-script", "js"],
"typescript": ["type script", "type-script", "ts"],
"postgresql": ["postgres", "postgre sql", "postgre-sql"],
"graphql": ["graph ql"],
"grpc": ["g rpc"],
}
def _build_skill_patterns() -> List[Tuple[str, re.Pattern]]:
patterns: List[Tuple[str, re.Pattern]] = []
for skill in sorted(SKILLS):
aliases = [skill] + _SKILL_ALIASES.get(skill, [])
for alias in aliases:
if skill == "java" and alias == "java":
# Do not match "java" inside "java script".
pat = re.compile(r"(?<![a-z0-9+#])java(?!\s*script)(?![a-z0-9+#])", re.I)
else:
pat = re.compile(r"(?<![a-z0-9+#])" + re.escape(alias) + r"(?![a-z0-9+#])", re.I)
patterns.append((skill, pat))
return patterns
_SKILL_PATTERNS = _build_skill_patterns()
ROLES = {
"backend","frontend","fullstack","devops","qa","sre","data engineer","data scientist","ml engineer",
"mobile","android","ios","team lead","tech lead","architect"
}
_ROLE_ALIASES: Dict[str, List[str]] = {
"backend": ["backend", "backend developer", "backend engineer", "бэкенд", "бекенд"],
"frontend": ["frontend", "frontend developer", "frontend engineer", "фронтенд", "фронт"],
"fullstack": ["fullstack", "full stack", "full-stack", "фулстек"],
"devops": ["devops", "dev ops", "platform engineer", "infrastructure engineer"],
"qa": ["qa", "quality assurance", "tester", "test engineer", "test automation", "manual qa"],
"sre": ["sre", "site reliability"],
"data engineer": ["data engineer"],
"data scientist": ["data scientist"],
"ml engineer": ["ml engineer", "machine learning engineer"],
"mobile": ["mobile developer", "mobile engineer"],
"android": ["android developer", "android engineer"],
"ios": ["ios developer", "ios engineer"],
"team lead": ["team lead", "teamlead"],
"tech lead": ["tech lead", "techlead"],
"architect": ["architect", "solution architect", "software architect"],
}
def _build_role_patterns() -> Dict[str, List[re.Pattern]]:
out: Dict[str, List[re.Pattern]] = {}
for role in ROLES:
aliases = _ROLE_ALIASES.get(role, [role])
out[role] = [
re.compile(r"(?<![a-z0-9+#])" + re.escape(a) + r"(?![a-z0-9+#])", re.I)
for a in aliases
]
return out
_ROLE_PATTERNS = _build_role_patterns()
# --- HR / RECRUITER FILTERS ---
# Words that indicate the line is about searching for candidates, not owning the skill.
HR_CONTEXT_RE = re.compile(
r"\b(hiring|recruitment|recruiter|sourc(ing|er)|talent|acquisition|vacancy|vacancies|candidate|staffing|headhunt)\b|"
r"\b(подбор|поиск|найм|закры(ла|л|тие)|ваканси|резюме|сорс(инг|ер)|рекрут|персонал|кадр(ы|ов)|hr)\b",
re.I
)
# Roles that explicitly define the person as Non-Engineering
NON_TECH_ROLES_RE = re.compile(
r"\b(recruiter|hr|talent|manager|generalist|human resources|head of recruitment|рекрутер|менеджер по персоналу|эйчар)\b",
re.I
)
# --- EXPERIENCE ---
AGE_LINE_RE = re.compile(
r"(?i)\b(мужчина|женщина|родил[а-я]*|возраст|years?\s+old)\b"
)
EXP_HEADER_RE = re.compile(
r"(?i)\b(опыт\s+работы|стаж(\s+работы)?|work\s+experience|experience)\b"
)
# "5 years 10 months"
EXP_SUMMARY_RE = re.compile(
r"(?i)\b(опыт\s+работы|стаж(\s+работы)?|work\s+experience|experience)\b"
r"[^0-9]{0,20}"
r"(?P<y>\d{1,2})\s*(?:год|года|лет|years?|yrs?)"
r"(?:[^0-9]{0,20}(?P<m>\d{1,2})\s*(?:мес|месяц|месяца|месяцев|months?))?"
)
EXP_NEARBY_RE = re.compile(
r"(?i)\b(?P<y>\d{1,2})\s*(?:год|года|лет|years?|yrs?)"
r"(?:[^0-9]{0,20}(?P<m>\d{1,2})\s*(?:мес|месяц|месяца|месяцев|months?))?"
)
HH_FOOTER_RE = re.compile(
r"(?P<name>[A-Za-zА-ЯЁ][A-Za-zА-Яа-яЁё'\-\s]{2,80})\s*[•|]\s*резюме\s+обновлено",
re.I,
)
NAME_KV_RE = re.compile(r"^\s*(name|имя)\s*[:\-]\s*(.+)$", re.I)
NAME_LINE_RE = re.compile(
r"^[A-ZА-ЯЁ][A-Za-zА-Яа-яЁё'\-]+(?:\s+[A-ZА-ЯЁ][A-Za-zА-Яа-яЁё'\-]+){1,3}$"
)
NAME_STOPWORDS = {
"resume", "cv", "contacts", "contact", "summary", "skills", "experience", "education",
"projects", "about", "profile", "objective", "навыки", "опыт", "образование",
"контакты", "профиль", "цель", "резюме",
"developer", "engineer", "backend", "frontend", "fullstack", "team lead", "tech lead",
"backend developer", "frontend developer", "fullstack developer", "software engineer",
"разработчик", "инженер", "бэкенд", "фронтенд", "фулстек", "тимлид", "техлид",
"top skills", "experience", "education", "languages", "certifications",
"skills & endorsements", "endorsements",
"university", "state university", "institute", "college", "academy", "school",
"bachelor", "master", "degree", "faculty", "университет", "институт", "академия",
"колледж", "школа", "бакалавр", "магистр", "факультет",
}
_NAME_BAD_WORDS = {
"skills", "top skills", "experience", "education", "languages", "certifications",
"projects", "summary", "about", "profile", "endorsements",
"university", "institute", "college", "academy", "school",
"bachelor", "master", "degree", "faculty",
}
NAME_INSTITUTION_RE = re.compile(
r"\b("
r"university|institute|college|academy|school|faculty|bachelor|master|degree|"
r"mathematics|computer science|informatics|physics|economics|management|"
r"университет|институт|академ|колледж|школа|факультет|бакалав|магистр|"
r"математик|информатик|физик|экономик|менеджмент"
r")\b",
re.I,
)
_EMAIL_PREFIX_STOP = {
"email", "mail", "contact", "contacts", "phone", "tel", "telegram", "linkedin", "github",
}
def _prune_fragment_emails(values: List[str]) -> List[str]:
uniq = sorted(set(v.lower().strip() for v in values if v and "@" in v))
out: List[str] = []
for e in uniq:
local, domain = e.split("@", 1)
drop = False
for other in uniq:
if other == e:
continue
ol, od = other.split("@", 1)
if od != domain:
continue
if len(local) <= 8 and len(ol) > len(local) + 2 and ol.endswith(local) and re.search(r"[._\-]", ol):
drop = True
break
if not drop:
out.append(e)
return out
def extract_experience_years(text: str) -> Tuple[Optional[float], Optional[float], float, Dict[str, Any]]:
"""
Returns (total_years, engineering_years, confidence, debug).
Logic:
1. Calculate TOTAL experience from summaries.
2. Check if the candidate is primarily a Recruiter/HR.
- If YES: engineering_years = 0.0 (prevents recruiters from showing up as Senior Devs).
- If NO: engineering_years = total_years (Optimistic assumption for valid devs).
"""
dbg: Dict[str, Any] = {"method": None, "matched": None, "is_recruiter": False}
total_years: Optional[float] = None
confidence = 0.0
lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]
# 1. Detect if Recruiter
    # Check the header (first ~15 lines) for HR/recruiter titles
header_text = "\n".join(lines[:15])
is_recruiter = bool(NON_TECH_ROLES_RE.search(header_text))
dbg["is_recruiter"] = is_recruiter
# 2. Extract Total Duration
if lines:
# Strategy A: Explicit summary
for i, ln in enumerate(lines[:200]):
if AGE_LINE_RE.search(ln): continue
# Look for summary line
if EXP_HEADER_RE.search(ln):
window = ln
if i + 1 < len(lines): window += " " + lines[i+1]
if i + 2 < len(lines): window += " " + lines[i+2]
m = EXP_SUMMARY_RE.search(window)
if m:
                    y = int(m.group("y"))
                    mm = int(m.group("m")) if m.group("m") else 0
                    val = float(round(y + (mm / 12.0), 2))
                    if 0 <= val <= 60:
                        total_years = val
                        dbg["method"] = "summary"
                        dbg["matched"] = m.group(0)
                        confidence = 0.95
                        break
# Strategy B: Fallback nearby
if total_years is None:
safe_lines = [ln for ln in lines if not AGE_LINE_RE.search(ln)]
for i, ln in enumerate(safe_lines):
if not EXP_HEADER_RE.search(ln): continue
chunk = " ".join(safe_lines[i : i + 12])
m = EXP_NEARBY_RE.search(chunk)
if m:
y = int(m.group("y"))
mm = int(m.group("m")) if m.group("m") else 0
val = float(round(y + (mm / 12.0), 2))
if 0 <= val <= 60:
total_years = val
dbg["method"] = "header_chunk"
dbg["matched"] = m.group(0)
confidence = 0.80
break
# 2.5 Timeline/range fallback-reconciliation
    # Protects against cases where the summary parser catches one short fragment
    # while the CV has a long timeline.
try:
alt = extract_experience(text or "")
except Exception:
alt = None
if alt and alt.years is not None:
if total_years is None:
total_years = alt.years
confidence = max(confidence, alt.confidence)
dbg["method"] = "timeline_fallback"
dbg["matched"] = "date_ranges"
elif alt.years > (total_years + 1.0):
strong_summary = str(dbg.get("method") or "") in ("summary", "header_chunk") and confidence >= 0.78
if strong_summary and (alt.years - float(total_years)) > 1.5:
dbg["reconcile"] = "timeline_skip_strong_summary"
else:
total_years = alt.years
confidence = max(confidence, min(0.82, alt.confidence))
dbg["method"] = "timeline_reconcile"
dbg["matched"] = "date_ranges"
# 3. Calculate Engineering Years
eng_years = total_years
if is_recruiter:
# If they are a recruiter, their "engineering" experience is effectively 0
# for the purpose of finding a Developer.
eng_years = 0.0
return total_years, eng_years, confidence, dbg
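# Example (illustrative, values approximate): a summary line such as
# "Опыт работы 5 лет 10 месяцев" is expected to be caught by Strategy A
# ("summary" method) and converted to roughly 5.83 total years; a recruiter
# title in the header would zero out the engineering years while keeping the total.
def _experience_years_example() -> Tuple[Optional[float], Optional[float], float, Dict[str, Any]]:
    return extract_experience_years("Backend developer\nОпыт работы 5 лет 10 месяцев\nPython, PostgreSQL")
    # expected: total around 5.83 via the "summary" strategy, engineering == total
    # (no recruiter markers), confidence around 0.95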
def _norm_phone(p: str) -> str:
digits = re.sub(r"\D+", "", p)
if digits.startswith("8") and len(digits) == 11:
digits = "7" + digits[1:]
return "+" + digits if digits else ""
def _norm_token(s: str) -> str:
return re.sub(r"\s+", " ", s.strip().lower())
def safe_json(v) -> str:
return json.dumps(v, ensure_ascii=False)
def extract_contacts(text: str) -> Dict[str, List[str]]:
    text = text or ""
    emails_set = set(m.group(0).lower() for m in EMAIL_RE.finditer(text))
for m in EMAIL_SPLIT_RE.finditer(text or ""):
prefix = m.group("prefix").strip().lower().strip(".-_")
if not prefix or prefix in _EMAIL_PREFIX_STOP:
continue
if not re.search(r"[._\-\d]", prefix):
continue
tail = m.group("tail").lower()
if "@" not in tail:
continue
local_tail, domain = tail.split("@", 1)
local = f"{prefix}{local_tail}"
if len(local) > 64:
continue
cand = f"{local}@{domain}"
if EMAIL_RE.fullmatch(cand):
emails_set.add(cand)
emails = _prune_fragment_emails(sorted(emails_set))
phones = sorted(set(_norm_phone(m.group(1)) for m in PHONE_RE.finditer(text) if _norm_phone(m.group(1))))
tg = sorted(set(m.group(1).lower() for m in TG_RE.finditer(text)))
gh = sorted(set(m.group(1).lower() for m in GITHUB_RE.finditer(text)))
li = sorted(set(m.group(1).lower() for m in LINKEDIN_RE.finditer(text)))
urls = sorted(set(m.group(0) for m in URL_RE.finditer(text)))
return {"emails": emails, "phones": phones, "telegram": tg, "github": gh, "linkedin": li, "urls": urls}
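# Example (illustrative): typical contact extraction for a small header block. The
# EMAIL_SPLIT_RE repair also rejoins emails whose local part was broken by a stray
# space (e.g. "ivan.p etrov@example.com"), as long as the prefix looks like a real local part.
def _contacts_example() -> Dict[str, List[str]]:
    text = (
        "Ivan Petrov\n"
        "ivan.petrov@example.com | +7 999 123-45-67\n"
        "t.me/ivan_petrov | github.com/ivanpetrov\n"
    )
    return extract_contacts(text)
    # expected: emails ["ivan.petrov@example.com"], phones ["+79991234567"],
    # github ["ivanpetrov"]; note the loose TG_RE may also pick up the email's
    # "@example" fragment as a telegram handle.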
def extract_name_guess(text: str) -> Optional[str]:
lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
if not lines:
return None
# 1) HH footer "Name • Резюме обновлено ..."
m = HH_FOOTER_RE.search(text or "")
if m:
cand = m.group("name").strip()
if _looks_like_name_line(cand):
return cand
# 2) Key-value line: "Name: ..." / "Имя: ..."
for ln in lines[:40]:
m2 = NAME_KV_RE.match(ln)
if m2:
cand = m2.group(2).strip()
cand = re.split(r"[|,/;]", cand)[0].strip()
if _looks_like_name_line(cand):
return cand
# 3) Name-like in first ~40 lines
for ln in lines[:40]:
if _looks_like_heading_line(ln):
continue
if _looks_like_name_line(ln):
return ln
# 4) Name-like near the end (pptx exports often put name there)
tail_start = max(0, len(lines) - 60)
for i in range(tail_start, len(lines)):
ln = lines[i]
if _looks_like_heading_line(ln):
continue
ctx = " ".join(lines[max(0, i - 2) : min(len(lines), i + 3)]).lower()
if NAME_INSTITUTION_RE.search(ctx):
continue
if _looks_like_name_line(ln):
return ln
return None
def _looks_like_heading_line(line: str) -> bool:
low = (line or "").strip().lower()
if not low:
return False
if low in _NAME_BAD_WORDS:
return True
if low.startswith("top skills"):
return True
if len(low.split()) <= 3 and any(w in low for w in ("skills", "experience", "education", "languages")):
return True
return False
def _looks_like_name_line(line: str) -> bool:
if not line:
return False
if len(line) > 80:
return False
low = line.lower().strip()
if low in NAME_STOPWORDS:
return False
if _looks_like_heading_line(line):
return False
if re.search(r"\b(resume|cv|резюме)\b", line, re.I):
return False
if NAME_INSTITUTION_RE.search(line):
return False
if not NAME_LINE_RE.match(line.strip()):
return False
return True
def extract_remote(text: str) -> Optional[bool]:
if not text:
return None
for ln in text.splitlines()[:120]:
if REMOTE_RE.search(ln):
return True
return None
def extract_english(text: str) -> Optional[str]:
t = text or ""
lines = [ln.strip() for ln in t.splitlines() if ln.strip()]
# 1) CEFR levels anywhere are accepted.
m = EN_RE.search(t)
if m:
return m.group(1).replace("+", "").upper()
# 2) Textual levels only when English context is present.
candidate_chunks: List[str] = []
for i, ln in enumerate(lines):
if EN_LANG_RE.search(ln):
candidate_chunks.append(ln)
if i + 1 < len(lines):
candidate_chunks.append(lines[i + 1])
if not candidate_chunks:
return None
m2 = EN_TEXT_RE.search("\n".join(candidate_chunks))
if not m2:
return None
word = m2.group(1).lower()
if word in ("native", "fluent", "proficient", "advanced"):
return "C1"
if word.startswith("upper"):
return "B2"
if word == "intermediate":
return "B1"
if word == "elementary":
return "A2"
return None
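# Example (illustrative): CEFR codes win anywhere in the text; textual levels are only
# trusted when an English-language context line is nearby.
def _english_example() -> Tuple[Optional[str], Optional[str]]:
    a = extract_english("Languages\nEnglish: B2 (Upper-Intermediate)")  # expected "B2" (CEFR hit)
    b = extract_english("Languages\nEnglish - fluent")  # expected "C1" via the textual fallback
    return a, b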
def extract_roles_skills(text: str) -> Tuple[List[str], List[str]]:
"""
Extracts roles and skills, but strictly filters out HR/Recruitment context.
"""
lines = text.splitlines()
# 1. Filter text: Remove lines that talk about hiring/vacancies
clean_lines = []
for ln in lines:
if not HR_CONTEXT_RE.search(ln):
clean_lines.append(ln)
clean_text = "\n".join(clean_lines).lower()
# 2. Extract Skills from clean text only
skills = []
for s, pat in _SKILL_PATTERNS:
if pat.search(clean_text):
skills.append(normalize_skill(s) or s)
skills = sorted(set(skills))
# 3. Extract Roles
# Priority: Header (first 10 lines)
header_text = "\n".join(lines[:10]).lower()
found_roles = set()
# Check if Recruiter
if NON_TECH_ROLES_RE.search(header_text):
# If explicit recruiter in header, do NOT add generic tech roles like "backend"
# even if they appear in the text (often describes who they hire).
pass
else:
# Normal extraction
for r in ROLES:
pats = _ROLE_PATTERNS.get(r, [])
if any(p.search(clean_text) for p in pats):
# extra guard: devops requires explicit evidence, not just CI/CD mentions
if r == "devops":
if not re.search(r"\b(devops|dev ops|sre|platform engineer|infrastructure)\b", clean_text, re.I):
continue
found_roles.add(r)
return sorted(list(found_roles)), skills
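# Example (illustrative): lines with hiring/recruiting context are dropped before matching,
# so a recruiter's "Looking for Python/Go engineers" does not credit them with those skills,
# while a developer's own stack still comes through.
def _roles_skills_example() -> Tuple[List[str], List[str]]:
    dev_text = "Backend developer\nStack: Python, FastAPI, PostgreSQL, Docker"
    roles, skills = extract_roles_skills(dev_text)
    # expected: roles include "backend"; skills include "python", "fastapi",
    # "postgresql", "docker" (after normalize_skill)
    return roles, skills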
def norm_pipe(tokens: List[str]) -> str:
toks = [_norm_token(t) for t in tokens if _norm_token(t)]
uniq = sorted(set(toks))
return "|" + "|".join(uniq) + "|" if uniq else "|"
def extract_salary(text: str) -> Tuple[Optional[int], Optional[int], float, Dict]:
dbg: Dict[str, Any] = {"numbers": [], "currency_hits": 0, "hint_lines": 0, "used_lines": []}
lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]
if not lines:
return None, None, 0.0, dbg
candidates: List[Tuple[int, str, bool, bool]] = []
for i, ln in enumerate(lines):
has_hint = SALARY_HINT_RE.search(ln) is not None
has_pay = PAY_TOKEN_RE.search(ln) is not None
if not has_hint and not has_pay:
continue
if SALARY_NOISE_RE.search(ln) and not has_hint:
continue
candidates.append((i, ln, has_hint, has_pay))
if not candidates:
return None, None, 0.0, dbg
has_hint = any(x[2] for x in candidates)
if not has_hint:
# Inline pay without "salary" is allowed only near header/contact block.
candidates = [x for x in candidates if x[0] < 15]
if not candidates:
return None, None, 0.0, dbg
scan_chunks: List[str] = []
for i, ln, hint, _ in candidates:
chunk = ln
if hint and (i + 1) < len(lines):
chunk = f"{chunk} {lines[i + 1]}"
scan_chunks.append(chunk)
dbg["used_lines"].append(ln)
if hint:
dbg["hint_lines"] += 1
dbg["currency_hits"] += len(CURRENCY_RE.findall(chunk))
nums: List[int] = []
for chunk in scan_chunks:
for m in NUM_RE.finditer(chunk):
val = None
if m.group(1) and m.group(2):
val = int(m.group(1)) * 1000
elif m.group(3):
val = int(re.sub(r"\s+", "", m.group(3)))
elif m.group(4):
val = int(m.group(4))
if val and 20_000 <= val <= 30_000_000:
nums.append(val)
dbg["numbers"].append(val)
if not nums:
return None, None, 0.0, dbg
nums = sorted(nums)
salary_min = nums[0]
salary_max = nums[-1] if len(nums) > 1 else nums[0]
if dbg["hint_lines"] > 0:
conf = 0.82 if dbg["currency_hits"] > 0 else 0.70
else:
conf = 0.58 if dbg["currency_hits"] > 0 else 0.0
if salary_max > salary_min * 4:
conf -= 0.12
if len(nums) == 1:
conf -= 0.06
conf = max(0.0, min(conf, 0.9))
if conf < 0.45:
return None, None, conf, dbg
return salary_min, salary_max, conf, dbg
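# Example (illustrative): a hinted line such as "Salary expectation: 250 000 RUB" gets
# hint + currency confidence; a bare "$300k" is accepted only near the top of the resume
# and at lower confidence, and counts like "10000 users" are rejected by SALARY_NOISE_RE.
def _salary_example() -> Tuple[Optional[int], Optional[int], float, Dict]:
    return extract_salary("Ivan Petrov\nSalary expectation: 250 000 RUB")
    # expected: (250000, 250000, ~0.76, debug): 0.82 for hint + currency minus 0.06 for a single number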
def extract_location_best_effort(text: str) -> Optional[str]:
if not text:
return None
def _clean_loc(val: str) -> str:
return re.sub(r"\s+", " ", (val or "").strip(" |,;"))
def _is_loc_like(val: str, *, allow_single: bool = False) -> bool:
v = _clean_loc(val)
if not v or len(v) < 3 or len(v) > 90:
return False
if re.search(r"[@/\\]", v) or re.search(r"\d{3,}", v):
return False
if SECTION_HEADER_RE.match(v):
return False
if LOCATION_CITY_COUNTRY_RE.match(v):
return True
if allow_single and re.fullmatch(r"[A-Za-zА-Яа-я][A-Za-zА-Яа-я' .\-]{1,40}", v):
return True
return False
patterns = [
re.compile(r"(?i)\b(location|город|city)\s*:\s*(.{2,40})"),
re.compile(r"(?i)\b(место)\s*:\s*(.{2,40})"),
re.compile(r"(?i)\b(проживает|проживание)\s*:\s*(.{2,60})"),
]
for p in patterns:
m = p.search(text)
if m:
val = _clean_loc(m.group(2))
if _is_loc_like(val, allow_single=True):
return val
lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
head: List[str] = []
for ln in lines[:60]:
if SECTION_HEADER_RE.match(ln):
low = ln.lower()
if low in ("contacts", "contact", "contact info"):
continue
break
head.append(ln)
for ln in head:
parts = [ln] + [seg.strip() for seg in ln.split("|") if seg.strip()]
for seg in parts:
if _is_loc_like(seg):
return _clean_loc(seg)
return None

211
extract/pdf_extract.py Normal file
View File

@@ -0,0 +1,211 @@
from __future__ import annotations
import re
import shutil
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple
try: # optional dependency
from pypdf import PdfReader # type: ignore
except Exception: # pragma: no cover
try:
from PyPDF2 import PdfReader # type: ignore
except Exception: # pragma: no cover
PdfReader = None # type: ignore
try: # optional dependency
from pdfminer.high_level import extract_text as pdfminer_extract_text # type: ignore
except Exception: # pragma: no cover
pdfminer_extract_text = None # type: ignore
@dataclass
class PdfExtractResult:
text: str
pages: List[dict]
method: str
score: float
flags: List[str]
_SECTION_HINTS = [
"experience", "work experience", "skills", "education", "projects", "summary", "about",
"опыт работы", "навыки", "образование", "проекты", "о себе",
]
def _which_pdftotext() -> Optional[str]:
exe = shutil.which("pdftotext") or shutil.which("pdftotext.exe")
return exe
def _run_pdftotext(path: Path, *, layout: bool, timeout_sec: int = 25) -> str:
exe = _which_pdftotext()
if not exe:
return ""
cmd = [exe]
if layout:
cmd.append("-layout")
cmd += ["-nopgbrk", str(path), "-"]
try:
p = subprocess.run(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
timeout=timeout_sec,
check=False,
text=True,
encoding="utf-8",
errors="ignore",
)
return (p.stdout or "").strip()
except Exception:
return ""
def _extract_pages_pypdf(path: Path, max_pages: int = 60) -> List[dict]:
if PdfReader is None:
return []
try:
reader = PdfReader(str(path), strict=False)
except Exception:
return []
pages: List[dict] = []
for i, page in enumerate(getattr(reader, "pages", [])):
if max_pages and i >= max_pages:
break
try:
text = page.extract_text() or ""
except Exception:
text = ""
pages.append({"page": i + 1, "text": text})
return pages
def _extract_pdfminer(path: Path) -> str:
if pdfminer_extract_text is None:
return ""
try:
return (pdfminer_extract_text(str(path)) or "").strip()
except Exception:
return ""
def _quality_score(text: str) -> Tuple[float, List[str]]:
flags: List[str] = []
if not text:
return 0.0, ["empty"]
total = len(text)
letters = sum(ch.isalpha() for ch in text)
spaces = text.count(" ")
alpha_ratio = letters / max(1, total)
space_ratio = spaces / max(1, total)
words = re.findall(r"[A-Za-zА-Яа-я0-9]+", text)
avg_word_len = (sum(len(w) for w in words) / max(1, len(words))) if words else 0.0
lines = [ln for ln in text.splitlines() if ln.strip()]
long_lines = [ln for ln in lines if len(ln) > 200]
long_line_ratio = (len(long_lines) / max(1, len(lines))) if lines else 0.0
glued_hits = len(re.findall(r"[a-zа-я][A-ZА-Я]|[A-Za-zА-Яа-я][0-9]|[0-9][A-Za-zА-Яа-я]", text))
section_hits = sum(1 for s in _SECTION_HINTS if s in text.lower())
score = 0.0
if alpha_ratio >= 0.45:
score += 2.0
elif alpha_ratio >= 0.30:
score += 1.0
else:
flags.append("low_alpha")
if 0.10 <= space_ratio <= 0.28:
score += 1.0
else:
flags.append("odd_spacing")
if 3.5 <= avg_word_len <= 9.0:
score += 1.0
else:
flags.append("odd_word_len")
if long_line_ratio <= 0.06:
score += 1.0
else:
flags.append("long_lines")
if glued_hits <= 6:
score += 1.0
else:
flags.append("glued_text")
if section_hits >= 2:
score += 1.0
elif section_hits == 1:
score += 0.5
if total < 200:
flags.append("short_text")
if alpha_ratio < 0.08 or total < 120:
flags.append("scan_like")
return score, flags
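# Example (illustrative): the score rewards normal-looking prose (balanced letter/space
# ratios, plausible word lengths, short lines, few glued tokens, recognizable section
# headers); empty or image-only extractions come back as 0.0 with "empty"/"scan_like" flags.
def _quality_example() -> Tuple[float, List[str]]:
    text = "Experience\nBackend developer at ACME Corp\nSkills\nPython, PostgreSQL, Docker, Kafka\n" * 5
    return _quality_score(text)
    # expected: several heuristics pass (roughly 4-6 points); _quality_score("") == (0.0, ["empty"])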
def deglue_text(text: str) -> str:
if not text:
return text
t = text
t = re.sub(r"([a-zа-я])([A-ZА-Я])", r"\1 \2", t)
t = re.sub(r"([A-Za-zА-Яа-я])([0-9])", r"\1 \2", t)
t = re.sub(r"([0-9])([A-Za-zА-Яа-я])", r"\1 \2", t)
t = re.sub(r"([:;])([A-Za-zА-Яа-я])", r"\1 \2", t)
return t
def extract_pdf_best(path: Path, timeout_sec: int = 25) -> PdfExtractResult:
candidates: List[Tuple[str, str]] = []
txt_layout = _run_pdftotext(path, layout=True, timeout_sec=timeout_sec)
if txt_layout:
candidates.append(("pdftotext_layout", txt_layout))
txt_plain = _run_pdftotext(path, layout=False, timeout_sec=timeout_sec)
if txt_plain:
candidates.append(("pdftotext_plain", txt_plain))
txt_pypdf = ""
if PdfReader is not None:
pages = _extract_pages_pypdf(path)
if pages:
txt_pypdf = "\n\n".join(p.get("text", "") for p in pages if p.get("text"))
if txt_pypdf:
candidates.append(("pypdf", txt_pypdf))
txt_pdfminer = _extract_pdfminer(path)
if txt_pdfminer:
candidates.append(("pdfminer", txt_pdfminer))
if not candidates:
return PdfExtractResult(text="", pages=[], method="none", score=0.0, flags=["empty"])
best_method = "none"
best_text = ""
best_score = -1.0
best_flags: List[str] = []
for method, text in candidates:
score, flags = _quality_score(text)
if score > best_score:
best_score = score
best_method = method
best_text = text
best_flags = flags
pages = _extract_pages_pypdf(path)
best_text = deglue_text(best_text)
return PdfExtractResult(text=best_text, pages=pages, method=best_method, score=best_score, flags=best_flags)
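# Usage sketch (illustrative): candidates are ranked purely by _quality_score, so the winning
# extractor can differ per file; "resume.pdf" is a hypothetical path.
def _pdf_extract_example() -> str:
    res = extract_pdf_best(Path("resume.pdf"))
    if res.score < 3.0 or "scan_like" in res.flags:
        # likely a scanned/image PDF; an OCR fallback would be needed upstream
        return ""
    return res.text  # de-glued text from the best-scoring extractor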

70
extract/sections.py Normal file
View File

@@ -0,0 +1,70 @@
from __future__ import annotations
import re
from typing import Dict, List, Optional, Tuple
_SECTION_PATTERNS: dict[str, List[re.Pattern]] = {
"contacts": [
re.compile(r"^\s*(contacts?|contact info|контакты)\s*$", re.I),
],
"about": [
re.compile(r"^\s*(summary|about|profile|objective|о\s+себе|обо\s+мне|профиль|цель)\s*$", re.I),
],
"skills": [
re.compile(r"^\s*(skills?|key skills|stack|tech( stack)?|навыки|технологии|компетенции)\s*$", re.I),
],
"experience": [
re.compile(r"^\s*(experience|work experience|employment|опыт\s+работы|опыт)\s*$", re.I),
],
"education": [
re.compile(r"^\s*(education|образование|курсы|certifications?|сертификаты)\s*$", re.I),
],
"projects": [
re.compile(r"^\s*(projects?|проекты)\s*$", re.I),
],
"languages": [
re.compile(r"^\s*(languages?|языки)\s*$", re.I),
],
"certifications": [
re.compile(r"^\s*(certifications?|сертификаты|курсы)\s*$", re.I),
],
"publications": [
re.compile(r"^\s*(publications?|публикации)\s*$", re.I),
],
}
def _match_header(line: str) -> Optional[str]:
for key, patterns in _SECTION_PATTERNS.items():
for rx in patterns:
if rx.match(line):
return key
return None
def split_sections(clean_text: str, doc_type: str | None = None) -> Dict[str, str]:
lines = [ln.strip() for ln in (clean_text or "").splitlines()]
sections: Dict[str, List[str]] = {"header": []}
current = "header"
for ln in lines:
if not ln:
continue
key = _match_header(ln)
if key:
current = key
sections.setdefault(current, [])
continue
sections.setdefault(current, []).append(ln)
out: Dict[str, str] = {}
for k, vals in sections.items():
text = "\n".join(vals).strip()
if text:
out[k] = text
return out
def sections_present(sections: Dict[str, str]) -> List[str]:
return sorted([k for k, v in (sections or {}).items() if v and k != "header"])
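# Example (illustrative): headers are matched line by line, so everything before the first
# recognized header stays under "header".
def _sections_example() -> Dict[str, str]:
    sections = split_sections("Ivan Petrov\nSkills\nPython, SQL\nExperience\nACME, 2019-2024")
    # expected: {"header": "Ivan Petrov", "skills": "Python, SQL", "experience": "ACME, 2019-2024"};
    # sections_present(sections) -> ["experience", "skills"]
    return sections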

View File

@@ -0,0 +1 @@
__all__ = []

View File

@@ -0,0 +1,46 @@
from __future__ import annotations
from typing import Any, Dict
from tg_resume_db.extract.parse import (
extract_contacts,
extract_name_guess,
extract_remote,
extract_english,
extract_roles_skills,
extract_salary,
extract_location_best_effort,
extract_experience_years,
)
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
text = clean_text or ""
contacts_raw = extract_contacts(text)
name = extract_name_guess(text)
remote = extract_remote(text)
english = extract_english(text)
roles, skills = extract_roles_skills(text)
location = extract_location_best_effort(text)
exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(text)
sal_min, sal_max, sal_conf, sal_dbg = extract_salary(text)
return {
"name": name,
"contacts_raw": contacts_raw,
"remote": remote,
"english": english,
"roles": roles,
"skills": skills,
"location": location,
"exp_years": exp_years,
"exp_years_eng": exp_years_eng,
"exp_conf": exp_conf,
"exp_dbg": exp_dbg,
"salary_min": sal_min,
"salary_max": sal_max,
"salary_conf": sal_conf,
"salary_dbg": sal_dbg,
"parse_method": "generic_heur",
}

58
extract/templates/hh.py Normal file
View File

@@ -0,0 +1,58 @@
from __future__ import annotations
from typing import Any, Dict
from tg_resume_db.extract.parse import (
extract_contacts,
extract_name_guess,
extract_remote,
extract_english,
extract_roles_skills,
extract_salary,
extract_location_best_effort,
extract_experience_years,
)
def _pick(sections: Dict[str, str] | None, key: str, fallback: str) -> str:
if not sections:
return fallback
return sections.get(key) or fallback
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
header_text = _pick(sections, "header", clean_text)
contacts_text = _pick(sections, "contacts", clean_text)
about_text = _pick(sections, "about", clean_text)
skills_text = _pick(sections, "skills", clean_text)
exp_text = _pick(sections, "experience", clean_text)
exp_scope = "\n".join([about_text, exp_text]).strip() or exp_text
name = extract_name_guess(header_text)
contacts_raw = extract_contacts(contacts_text)
roles, skills = extract_roles_skills("\n".join([about_text, skills_text, exp_text]))
remote = extract_remote(clean_text)
english = extract_english(clean_text)
location = extract_location_best_effort(clean_text)
exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(exp_scope)
sal_min, sal_max, sal_conf, sal_dbg = extract_salary(clean_text)
return {
"name": name,
"contacts_raw": contacts_raw,
"remote": remote,
"english": english,
"roles": roles,
"skills": skills,
"location": location,
"exp_years": exp_years,
"exp_years_eng": exp_years_eng,
"exp_conf": exp_conf,
"exp_dbg": exp_dbg,
"salary_min": sal_min,
"salary_max": sal_max,
"salary_conf": sal_conf,
"salary_dbg": sal_dbg,
"parse_method": "hh_template",
}

View File

@@ -0,0 +1,85 @@
from __future__ import annotations
import re
from typing import Any, Dict, Optional
from tg_resume_db.extract.parse import (
extract_contacts,
extract_name_guess,
extract_remote,
extract_english,
extract_roles_skills,
extract_salary,
extract_location_best_effort,
extract_experience_years,
)
_DESIRED_RE = re.compile(r"(?i)жел[а-я]*\s+должност[ьи]\s*[:\-]?\s*(.+)")
_SPEC_RE = re.compile(r"(?i)специализаци[яи]\s*[:\-]?\s*(.+)")
_SCHEDULE_RE = re.compile(r"(?i)график\s+работы\s*[:\-]?\s*(.+)")
_EMPLOYMENT_RE = re.compile(r"(?i)занятость\s*[:\-]?\s*(.+)")
def _pick(sections: Dict[str, str] | None, key: str, fallback: str) -> str:
if not sections:
return fallback
return sections.get(key) or fallback
def _find_first(regex: re.Pattern, text: str) -> Optional[str]:
for ln in text.splitlines():
m = regex.search(ln)
if m:
val = m.group(1).strip()
val = re.split(r"[|;/]", val)[0].strip()
if 2 <= len(val) <= 80:
return val
return None
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
header_text = _pick(sections, "header", clean_text)
contacts_text = _pick(sections, "contacts", clean_text)
about_text = _pick(sections, "about", clean_text)
skills_text = _pick(sections, "skills", clean_text)
exp_text = _pick(sections, "experience", clean_text)
exp_scope = "\n".join([about_text, exp_text]).strip() or exp_text
name = extract_name_guess(header_text)
contacts_raw = extract_contacts(contacts_text)
roles, skills = extract_roles_skills("\n".join([about_text, skills_text, exp_text]))
remote = extract_remote(clean_text)
english = extract_english(clean_text)
location = extract_location_best_effort(clean_text)
exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(exp_scope)
sal_min, sal_max, sal_conf, sal_dbg = extract_salary(clean_text)
desired_title = _find_first(_DESIRED_RE, clean_text)
specializations = _find_first(_SPEC_RE, clean_text)
schedule = _find_first(_SCHEDULE_RE, clean_text)
employment = _find_first(_EMPLOYMENT_RE, clean_text)
return {
"name": name,
"contacts_raw": contacts_raw,
"remote": remote,
"english": english,
"roles": roles,
"skills": skills,
"location": location,
"exp_years": exp_years,
"exp_years_eng": exp_years_eng,
"exp_conf": exp_conf,
"exp_dbg": exp_dbg,
"salary_min": sal_min,
"salary_max": sal_max,
"salary_conf": sal_conf,
"salary_dbg": sal_dbg,
"desired_title": desired_title,
"specializations": specializations,
"employment_type": employment,
"schedule": schedule,
"parse_method": "hh_template",
}

View File

@@ -0,0 +1,57 @@
from __future__ import annotations
from typing import Any, Dict
from tg_resume_db.extract.parse import (
extract_contacts,
extract_name_guess,
extract_remote,
extract_english,
extract_roles_skills,
extract_salary,
extract_location_best_effort,
extract_experience_years,
)
def _pick(sections: Dict[str, str] | None, key: str, fallback: str) -> str:
if not sections:
return fallback
return sections.get(key) or fallback
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
header_text = _pick(sections, "header", clean_text)
about_text = _pick(sections, "about", clean_text)
skills_text = _pick(sections, "skills", clean_text)
exp_text = _pick(sections, "experience", clean_text)
exp_scope = "\n".join([about_text, exp_text]).strip() or exp_text
name = extract_name_guess(header_text)
contacts_raw = extract_contacts(clean_text)
roles, skills = extract_roles_skills("\n".join([about_text, skills_text, exp_text]))
remote = extract_remote(clean_text)
english = extract_english(clean_text)
location = extract_location_best_effort(clean_text)
exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(exp_scope)
sal_min, sal_max, sal_conf, sal_dbg = extract_salary(clean_text)
return {
"name": name,
"contacts_raw": contacts_raw,
"remote": remote,
"english": english,
"roles": roles,
"skills": skills,
"location": location,
"exp_years": exp_years,
"exp_years_eng": exp_years_eng,
"exp_conf": exp_conf,
"exp_dbg": exp_dbg,
"salary_min": sal_min,
"salary_max": sal_max,
"salary_conf": sal_conf,
"salary_dbg": sal_dbg,
"parse_method": "linkedin_template",
}

View File

@@ -0,0 +1,46 @@
from __future__ import annotations
from typing import Any, Dict
from tg_resume_db.extract.parse import (
extract_contacts,
extract_name_guess,
extract_remote,
extract_english,
extract_roles_skills,
extract_salary,
extract_location_best_effort,
extract_experience_years,
)
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
text = clean_text or ""
contacts_raw = extract_contacts(text)
name = extract_name_guess(text)
roles, skills = extract_roles_skills(text)
remote = extract_remote(text)
english = extract_english(text)
location = extract_location_best_effort(text)
exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(text)
sal_min, sal_max, sal_conf, sal_dbg = extract_salary(text)
return {
"name": name,
"contacts_raw": contacts_raw,
"remote": remote,
"english": english,
"roles": roles,
"skills": skills,
"location": location,
"exp_years": exp_years,
"exp_years_eng": exp_years_eng,
"exp_conf": exp_conf,
"exp_dbg": exp_dbg,
"salary_min": sal_min,
"salary_max": sal_max,
"salary_conf": sal_conf,
"salary_dbg": sal_dbg,
"parse_method": "one_page_template",
}

View File

@@ -0,0 +1,11 @@
from __future__ import annotations
from typing import Any, Dict
from tg_resume_db.extract.templates.one_page import parse_resume as _parse
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
out = _parse(clean_text, sections)
out["parse_method"] = "one_page_en"
return out

View File

@@ -0,0 +1,11 @@
from __future__ import annotations
from typing import Any, Dict
from tg_resume_db.extract.templates.one_page import parse_resume as _parse
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
out = _parse(clean_text, sections)
out["parse_method"] = "one_page_ru"
return out

View File

@@ -0,0 +1,45 @@
from __future__ import annotations
from typing import Any, Dict
from tg_resume_db.extract.parse import (
extract_contacts,
extract_name_guess,
extract_remote,
extract_english,
extract_roles_skills,
extract_salary,
extract_location_best_effort,
extract_experience_years,
)
def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
text = clean_text or ""
contacts_raw = extract_contacts(text)
name = extract_name_guess(text)
roles, skills = extract_roles_skills(text)
remote = extract_remote(text)
english = extract_english(text)
location = extract_location_best_effort(text)
exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(text)
sal_min, sal_max, sal_conf, sal_dbg = extract_salary(text)
return {
"name": name,
"contacts_raw": contacts_raw,
"remote": remote,
"english": english,
"roles": roles,
"skills": skills,
"location": location,
"exp_years": exp_years,
"exp_years_eng": exp_years_eng,
"exp_conf": exp_conf,
"exp_dbg": exp_dbg,
"salary_min": sal_min,
"salary_max": sal_max,
"salary_conf": sal_conf,
"salary_dbg": sal_dbg,
"parse_method": "pptx_template",
}

99
extract/text_extract.py Normal file
View File

@@ -0,0 +1,99 @@
from __future__ import annotations
import os
from pathlib import Path
import logging
from bs4 import BeautifulSoup
try: # optional dependency for PDF fallback
from pypdf import PdfReader as _PdfReader # type: ignore
except Exception: # pragma: no cover - optional import
try:
from PyPDF2 import PdfReader as _PdfReader # type: ignore
except Exception: # pragma: no cover
_PdfReader = None # type: ignore
def _read_bytes(path: Path) -> bytes:
return path.read_bytes()
def extract_text_from_txt(path: Path) -> str:
data = _read_bytes(path)
    # Strict decode inside the loop so non-UTF-8 files actually fall through to the next
    # encoding; errors="ignore" is reserved for the final fallback.
    for enc in ("utf-8", "utf-16", "cp1251", "latin-1"):
        try:
            return data.decode(enc)
        except Exception:
            continue
    return data.decode("utf-8", errors="ignore")
def extract_text_from_html(path: Path) -> str:
html = extract_text_from_txt(path)
soup = BeautifulSoup(html, "lxml")
return soup.get_text("\n", strip=True)
def extract_text_from_docx(path: Path) -> str:
from docx import Document
doc = Document(str(path))
parts = []
for p in doc.paragraphs:
if p.text and p.text.strip():
parts.append(p.text.strip())
for table in doc.tables:
for row in table.rows:
cells = [c.text.strip() for c in row.cells if c.text and c.text.strip()]
if cells:
parts.append(" | ".join(cells))
return "\n".join(parts)
_PDF_PAGE_LIMIT = int(os.environ.get("PDF_PAGE_LIMIT", "40"))
# Silence noisy pypdf warnings like "Ignoring wrong pointing object ..."
logging.getLogger("pypdf").setLevel(logging.ERROR)
logging.getLogger("PyPDF2").setLevel(logging.ERROR)
def extract_text_from_pdf(path: Path) -> str:
"""
Lightweight PDF extractor; prefers optional PyPDF-based readers over heavy pdfminer.
Reads at most PDF_PAGE_LIMIT pages (default 40) to avoid pathological files.
"""
if _PdfReader is None:
raise RuntimeError("PDF reader dependency missing (install pypdf or PyPDF2)")
try:
reader = _PdfReader(str(path), strict=False)
except Exception as exc: # pragma: no cover - pdf parser edge cases
raise RuntimeError(f"PDF read failed: {exc}") from exc
parts = []
for idx, page in enumerate(getattr(reader, "pages", [])):
if _PDF_PAGE_LIMIT and idx >= _PDF_PAGE_LIMIT:
break
try:
text = page.extract_text() # type: ignore[attr-defined]
except Exception:
text = None
if text:
parts.append(text)
return "\n".join(parts)
def extract_text_from_doc_best_effort(path: Path) -> str:
# .doc requires external tools; best-effort if textract installed
try:
import textract # type: ignore
b = textract.process(str(path))
return b.decode("utf-8", errors="ignore")
except Exception:
return ""
def extract_text(path: Path) -> str:
ext = path.suffix.lower()
if ext in (".txt", ".log"):
return extract_text_from_txt(path)
if ext in (".html", ".htm"):
return extract_text_from_html(path)
if ext == ".docx":
return extract_text_from_docx(path)
if ext == ".pdf":
return extract_text_from_pdf(path)
if ext == ".doc":
return extract_text_from_doc_best_effort(path)
return ""
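# Usage sketch (illustrative): dispatch is by extension only, so a mislabelled file (e.g.
# HTML saved with a .pdf suffix) goes down the PDF path and may raise; callers are expected
# to handle errors per file. "cv.docx" is a hypothetical path.
def _extract_text_example() -> str:
    try:
        return extract_text(Path("cv.docx"))
    except Exception:
        return ""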

21
importers/file_scan.py Normal file
View File

@@ -0,0 +1,21 @@
from __future__ import annotations
from pathlib import Path
from typing import Dict, Iterator
RESUME_EXTS = {".pdf", ".docx", ".doc", ".txt", ".html", ".htm"}
def iter_files(root: Path) -> Iterator[Dict]:
for p in root.rglob("*"):
if p.is_file() and p.suffix.lower() in RESUME_EXTS:
yield {
"origin_type": "file_scan",
"export_path": str(root),
"chat_title": None,
"message_id": None,
"message_date": None,
"message_text": "",
"file_path": str(p.resolve()),
"original_name": p.name,
"extra": {},
}

View File

@@ -0,0 +1,66 @@
from __future__ import annotations
import re
from pathlib import Path
from typing import Dict, Iterator, List, Optional
from bs4 import BeautifulSoup
RESUME_EXTS = {".pdf", ".docx", ".doc", ".txt", ".html", ".htm"}
def find_messages_html(root: Path) -> List[Path]:
return [p for p in root.rglob("messages*.html") if p.is_file()]
def iter_artifacts(messages_html: Path) -> Iterator[Dict]:
html = messages_html.read_text(encoding="utf-8", errors="ignore")
soup = BeautifulSoup(html, "lxml")
chat_title = None
h = soup.find(class_=re.compile(r"page_header", re.I))
if h:
chat_title = h.get_text(" ", strip=True)
chat_title = chat_title or messages_html.parent.name
for msg in soup.select(".message.default.clearfix, .message"):
message_id = msg.get("id") or None
date_div = msg.select_one(".date")
msg_date = date_div.get("title") if date_div else None
text_div = msg.select_one(".text")
msg_text = text_div.get_text("\n", strip=True) if text_div else ""
file_path = None
original_name = None
for a in msg.find_all("a", href=True):
href = a["href"]
p = (messages_html.parent / href).resolve()
if p.exists() and p.suffix.lower() in RESUME_EXTS:
file_path = str(p)
original_name = p.name
break
if file_path:
yield {
"origin_type": "telegram_html",
"export_path": str(messages_html.parent),
"chat_title": chat_title,
"message_id": str(message_id) if message_id else None,
"message_date": msg_date,
"message_text": msg_text or "",
"file_path": file_path,
"original_name": original_name,
"extra": {"html_path": str(messages_html)},
}
else:
if msg_text and len(msg_text.strip()) >= 500:
yield {
"origin_type": "message_text",
"export_path": str(messages_html.parent),
"chat_title": chat_title,
"message_id": str(message_id) if message_id else None,
"message_date": msg_date,
"message_text": msg_text,
"file_path": None,
"original_name": None,
"extra": {"html_path": str(messages_html)},
}

View File

@@ -0,0 +1,73 @@
from __future__ import annotations
import json
from pathlib import Path
from typing import Dict, Iterator, List, Optional
RESUME_EXTS = {".pdf", ".docx", ".doc", ".txt", ".html", ".htm"}
def find_result_json(root: Path) -> List[Path]:
return list(root.rglob("result.json"))
def _text_field_to_str(text_field) -> str:
if isinstance(text_field, str):
return text_field
if isinstance(text_field, list):
parts = []
for item in text_field:
if isinstance(item, str):
parts.append(item)
elif isinstance(item, dict) and "text" in item:
parts.append(str(item["text"]))
return "".join(parts)
return ""
def iter_artifacts(result_json: Path) -> Iterator[Dict]:
data = json.loads(result_json.read_text(encoding="utf-8", errors="ignore"))
    chats = []
    if isinstance(data, dict):
        chats_field = data.get("chats") or {}
        if isinstance(chats_field, dict):
            chats = chats_field.get("list", []) or []
        elif isinstance(chats_field, list):
            chats = chats_field
for chat in chats:
chat_title = chat.get("name") or chat.get("title") or "unknown_chat"
messages = chat.get("messages", []) or []
for msg in messages:
msg_id = str(msg.get("id") or "")
msg_date = msg.get("date") or msg.get("date_unixtime") or None
text = _text_field_to_str(msg.get("text", ""))
file_rel = msg.get("file") or None
file_path = None
original_name = None
if file_rel:
p = (result_json.parent / file_rel).resolve()
if p.exists() and p.suffix.lower() in RESUME_EXTS:
file_path = str(p)
original_name = p.name
if file_path:
yield {
"origin_type": "telegram_json",
"export_path": str(result_json.parent),
"chat_title": chat_title,
"message_id": msg_id,
"message_date": str(msg_date) if msg_date is not None else None,
"message_text": text or "",
"file_path": file_path,
"original_name": original_name,
"extra": {"json_path": str(result_json)},
}
else:
# message-only resume paste (heuristic)
if text and len(text.strip()) >= 500:
yield {
"origin_type": "message_text",
"export_path": str(result_json.parent),
"chat_title": chat_title,
"message_id": msg_id,
"message_date": str(msg_date) if msg_date is not None else None,
"message_text": text,
"file_path": None,
"original_name": None,
"extra": {"json_path": str(result_json)},
}

174
normalize.py Normal file
View File

@@ -0,0 +1,174 @@
from __future__ import annotations
import re
from typing import Dict, List, Optional, Tuple
_SKILL_SYNONYMS: Dict[str, List[str]] = {
"python": ["py"],
"javascript": ["js", "node", "nodejs", "java script", "java-script"],
"typescript": ["ts", "type script", "type-script"],
"postgresql": ["postgres", "psql"],
"kubernetes": ["k8s"],
"docker": [],
"fastapi": [],
"django": ["drf", "django rest framework"],
"flask": [],
"golang": ["go"],
"c++": ["cpp"],
"c#": ["csharp"],
"redis": [],
"kafka": [],
"rabbitmq": [],
"grpc": [],
"rest": [],
}
_SKILL_STOP = {"rest", "http", "json", "xml", "oop"}
_ROLE_SYNONYMS: Dict[str, List[str]] = {
"backend": ["backend developer", "backend engineer", "бэкенд", "бекенд", "серверный разработчик"],
"frontend": ["frontend developer", "frontend engineer", "фронтенд", "фронт"],
"fullstack": ["full stack", "full-stack", "фулстек", "fullstack developer"],
"devops": ["sre", "site reliability"],
"qa": ["tester", "тестировщик"],
"data": ["data engineer", "data scientist", "ml engineer", "машинное обучение"],
"mobile": ["android", "ios", "mobile developer", "мобильный разработчик"],
}
def _build_alias_map(src: Dict[str, List[str]]) -> Dict[str, str]:
alias = {}
for canonical, al in src.items():
alias[canonical] = canonical
for a in al:
alias[a] = canonical
return {k.lower(): v for k, v in alias.items()}
_SKILL_ALIAS = _build_alias_map(_SKILL_SYNONYMS)
_ROLE_ALIAS = _build_alias_map(_ROLE_SYNONYMS)
def _normalize_skill_surface(token: str) -> str:
t = (token or "").strip().lower()
if not t:
return ""
t = t.replace("/", " ")
t = re.sub(r"[_\-]+", " ", t)
t = re.sub(r"\s+", " ", t).strip()
# "java script", "type script", "postgre sql", "graph ql", "g rpc"
t = re.sub(r"\bjava\s+script\b", "javascript", t)
t = re.sub(r"\btype\s+script\b", "typescript", t)
t = re.sub(r"\bpostgre\s+sql\b", "postgresql", t)
t = re.sub(r"\bgraph\s+ql\b", "graphql", t)
t = re.sub(r"\bg\s+rpc\b", "grpc", t)
t = re.sub(r"\bdocker\s+compose\b", "docker compose", t)
return t
def normalize_skill(token: str) -> Optional[str]:
t = _normalize_skill_surface(token)
if not t:
return None
# Avoid false-positive java from "javascript"
if t == "java" and re.search(r"\bjava\s*script\b", _normalize_skill_surface(token)):
return "javascript"
return _SKILL_ALIAS.get(t, t)
def normalize_skills(skills: List[str]) -> List[str]:
out: List[str] = []
seen = set()
for s in skills or []:
canon = normalize_skill(s)
if not canon or canon in seen:
continue
seen.add(canon)
out.append(canon)
return out
def normalize_role(token: str) -> Optional[str]:
t = (token or "").strip().lower()
if not t:
return None
return _ROLE_ALIAS.get(t, t)
def normalize_roles(roles: List[str]) -> List[str]:
out: List[str] = []
seen = set()
for r in roles or []:
canon = normalize_role(r)
if not canon or canon in seen:
continue
seen.add(canon)
out.append(canon)
return out
def split_skills_primary_secondary(
skills: List[str],
*,
clean_text: str,
sections: Dict[str, str] | None = None,
primary_limit: int = 25,
) -> Tuple[List[str], List[str]]:
if not skills:
return [], []
text = (clean_text or "").lower()
skills_section = (sections or {}).get("skills", "").lower()
experience_section = (sections or {}).get("experience", "").lower()
scores: Dict[str, float] = {}
for sk in skills:
s = sk.lower()
score = 1.0
if s in skills_section:
score += 2.2
if s in experience_section:
score += 1.2
count = len(re.findall(r"\b" + re.escape(s) + r"\b", text))
score += min(2.5, count * 0.5)
if s in _SKILL_STOP:
score -= 1.5
scores[sk] = score
ranked = sorted(skills, key=lambda x: scores.get(x, 0.0), reverse=True)
primary = [s for s in ranked if scores.get(s, 0.0) >= 2.0][:primary_limit]
secondary = [s for s in ranked if s not in primary]
return primary, secondary
def normalize_location(raw: Optional[str]) -> Optional[str]:
if not raw:
return None
t = raw.strip()
low = t.lower()
if low in ("москва", "moscow", "moscow, russia"):
return "Moscow, Russia"
if low in ("санкт-петербург", "спб", "питер", "saint petersburg"):
return "Saint Petersburg, Russia"
return t
def find_skills_in_text(text: str) -> List[str]:
if not text:
return []
found: List[str] = []
seen = set()
low = _normalize_skill_surface(text)
    for alias, canon in _SKILL_ALIAS.items():
        key = _normalize_skill_surface(alias)
        if not key or canon in seen:
            continue
        # lookarounds instead of \b so aliases like "c++" / "c#" still match at word edges
        if re.search(r"(?<![a-z0-9+#])" + re.escape(key) + r"(?![a-z0-9+#])", low):
            found.append(canon)
            seen.add(canon)
return found
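# Example (illustrative): alias folding plus de-duplication, order of first hits preserved.
def _normalize_example() -> List[str]:
    return normalize_skills(["JS", "NodeJS", "Postgres", "k8s", "Python"])
    # expected: ["javascript", "postgresql", "kubernetes", "python"]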

45
pdf_merge.py Normal file
View File

@@ -0,0 +1,45 @@
from __future__ import annotations
from pathlib import Path
from typing import Iterable, List, Optional
from pypdf import PdfReader, PdfWriter
def merge_pdfs(pdf_paths: Iterable[str | Path], out_pdf_path: str | Path) -> dict:
out_pdf_path = Path(out_pdf_path)
out_pdf_path.parent.mkdir(parents=True, exist_ok=True)
writer = PdfWriter()
merged: List[str] = []
skipped: List[str] = []
for p in pdf_paths:
path = Path(p)
try:
reader = PdfReader(str(path))
            # simply append the pages one after another
for page in reader.pages:
writer.add_page(page)
merged.append(str(path))
except Exception:
skipped.append(str(path))
if merged:
with out_pdf_path.open("wb") as f:
writer.write(f)
return {
"out_pdf": str(out_pdf_path),
"merged_count": len(merged),
"skipped_count": len(skipped),
"merged_files": merged,
"skipped_files": skipped,
}
def merge_all_pdfs_in_dir(files_dir: str | Path, out_pdf_path: str | Path) -> dict:
files_dir = Path(files_dir)
    # match .pdf case-insensitively without double-counting on case-insensitive filesystems
    pdfs = sorted(p for p in files_dir.rglob("*") if p.is_file() and p.suffix.lower() == ".pdf")
return merge_pdfs(pdfs, out_pdf_path)
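# Usage sketch (illustrative): merge every PDF found under an export directory into a single
# bundle; the paths here are hypothetical.
def _merge_example() -> dict:
    return merge_all_pdfs_in_dir("./export/files", "./export/bundle.pdf")
    # returns e.g. {"out_pdf": "./export/bundle.pdf", "merged_count": ..., "skipped_files": [...]}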

1990
pipeline.py Normal file

File diff suppressed because it is too large Load Diff

393
search.py Normal file
View File

@@ -0,0 +1,393 @@
from __future__ import annotations
import json
import re
import sqlite3
from typing import Any, Dict, List, Tuple
from tg_resume_db.normalize import normalize_skill, find_skills_in_text
# -----------------------------
# Normalization helpers
# -----------------------------
def _norm_token(v: str) -> str:
return " ".join(str(v).strip().lower().split())
def _as_list(v: Any) -> List[str]:
"""
Accepts:
- None
- list
- "a,b,c" (csv string)
"""
if v is None:
return []
if isinstance(v, list):
return [str(x) for x in v if str(x).strip()]
s = str(v).strip()
if not s:
return []
return [x.strip() for x in s.split(",") if x.strip()]
def _uniq_keep_order(xs: List[str]) -> List[str]:
seen = set()
out: List[str] = []
for x in xs:
t = _norm_token(x)
if not t or t in seen:
continue
seen.add(t)
out.append(t)
return out
# -----------------------------
# Pipe-normalized columns filters
# skills_norm / roles_norm like: "|python|fastapi|"
# -----------------------------
def _pipe_any_clause(field: str, values: List[str]) -> Tuple[str, List[Any]]:
vals = [_norm_token(x) for x in (values or []) if str(x).strip()]
if not vals:
return ("1=1", [])
parts: List[str] = []
args: List[Any] = []
for v in vals:
parts.append(f"instr({field}, ?) > 0")
args.append(f"|{v}|")
return "(" + " OR ".join(parts) + ")", args
def _pipe_all_clause(field: str, values: List[str]) -> Tuple[str, List[Any]]:
vals = [_norm_token(x) for x in (values or []) if str(x).strip()]
if not vals:
return ("1=1", [])
parts: List[str] = []
args: List[Any] = []
for v in vals:
parts.append(f"instr({field}, ?) > 0")
args.append(f"|{v}|")
return "(" + " AND ".join(parts) + ")", args
# -----------------------------
# FTS5 sanitizer (fixes comma/garbage breaking MATCH)
# -----------------------------
# allow longer queries (lists of names, long prompts) without aggressive truncation
_FTS_MAX_TERMS = 48
def _fts_safe_query(q: str) -> str:
"""
Turn a free-form recruiter text into a safe FTS5 MATCH expression.
We intentionally DO NOT allow raw FTS syntax from user input,
because it easily breaks on commas/quotes/etc.
Example:
"Backend developer, опыт 5+ лет, Java C++ Python" ->
"\"backend\" OR \"developer\" OR \"опыт\" OR \"лет\" OR \"java\" OR \"cpp\" OR \"python\""
"""
if not q:
return "resume"
s = q.strip().lower()
# normalize common tokens
s = s.replace("c++", "cpp")
s = s.replace("c#", "csharp")
s = s.replace(".net", "dotnet")
# remove punctuation that breaks MATCH
s = re.sub(r"[,\(\)\[\]\{\};:]+", " ", s)
s = re.sub(r"\s+", " ", s).strip()
# tokens (latin/cyrillic + digits + a few chars)
terms = re.findall(r"[a-z0-9а-яё][a-z0-9а-яё._#+-]{1,}", s, flags=re.I)
terms = terms[:_FTS_MAX_TERMS]
if not terms:
return "resume"
# quote every term => safe; join with OR => broad query
return " OR ".join([f"\"{t}\"" for t in terms])
def _parse_query_modifiers(q: str) -> Tuple[List[str], List[str], str]:
"""
Extract +must and -exclude skills from query; return (must, exclude, cleaned_query).
"""
if not q:
return [], [], ""
    must_raw = re.findall(r"(?<!\w)\+([A-Za-z0-9#.+-]{2,})", q)
    excl_raw = re.findall(r"(?<!\w)-([A-Za-z0-9#.+-]{2,})", q)
must = []
exclude = []
for t in must_raw:
canon = normalize_skill(t)
if canon:
must.append(canon)
for t in excl_raw:
canon = normalize_skill(t)
if canon:
exclude.append(canon)
if " and " in q.lower() or " & " in q:
must += find_skills_in_text(q)
    cleaned = re.sub(r"(?<!\w)[+-][A-Za-z0-9#.+-]{2,}", " ", q)
cleaned = re.sub(r"\s+", " ", cleaned).strip()
return _uniq_keep_order(must), _uniq_keep_order(exclude), cleaned
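# Example (illustrative): "+" marks required skills, "-" marks exclusions, and both are
# stripped from the text handed to FTS.
def _modifiers_example() -> Tuple[List[str], List[str], str]:
    return _parse_query_modifiers("backend +python +fastapi -php senior")
    # expected: (["python", "fastapi"], ["php"], "backend senior")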
# -----------------------------
# Contacts
# -----------------------------
def _fetch_contacts_map(con: sqlite3.Connection, candidate_id: str) -> Dict[str, List[str]]:
rows = con.execute(
"SELECT contact_type, contact_value FROM candidate_contacts WHERE candidate_id=?",
(candidate_id,),
).fetchall()
m: Dict[str, List[str]] = {}
for r in rows:
m.setdefault(r["contact_type"], []).append(r["contact_value"])
    # a bit cleaner: de-duplicate contact values
for k, vals in list(m.items()):
m[k] = _uniq_keep_order(vals)
return m
# -----------------------------
# Main search (FTS + filters)
# -----------------------------
def search(
con: sqlite3.Connection,
query: str,
filters: Dict[str, Any],
limit: int = 20,
offset: int = 0,
) -> List[Dict[str, Any]]:
"""
Search candidates using:
- FTS5 for ranking/snippet
- stack filters for skills/roles via pipe-normalized columns
- basic filters: remote/location/experience/salary/english
"""
where: List[str] = ["r.is_active = 1"]
params: List[Any] = []
must_skills, exclude_skills, cleaned_query = _parse_query_modifiers(query or "")
# -------- basic filters --------
if filters.get("remote") is not None:
where.append("c.remote = ?")
params.append(1 if bool(filters["remote"]) else 0)
if filters.get("location"):
where.append("c.location IS NOT NULL AND lower(c.location) LIKE ?")
params.append("%" + str(filters["location"]).lower() + "%")
    # Use experience_years for the SQL filter (broad search);
    # the strict experience_years_eng check happens during post-filtering in agent.py
if filters.get("experience_min") is not None:
where.append("c.experience_years IS NOT NULL AND c.experience_years >= ?")
params.append(float(filters["experience_min"]))
# Salary: "unknown salary doesn't exclude"
if filters.get("salary_min") is not None:
where.append("(c.salary_max IS NULL OR c.salary_max >= ?)")
params.append(int(filters["salary_min"]))
if filters.get("salary_max") is not None:
where.append("(c.salary_min IS NULL OR c.salary_min <= ?)")
params.append(int(filters["salary_max"]))
if filters.get("doc_type"):
where.append("r.doc_type = ?")
params.append(str(filters["doc_type"]))
    # English: do not filter at the SQL level (otherwise B2 would miss C1/C2); post-filtered in agent.py
# -------- roles/skills stack filters --------
# backward compatibility
skills_any: List[str] = []
skills_all: List[str] = []
roles_any: List[str] = []
if filters.get("skill"):
skills_any.append(str(filters["skill"]))
if filters.get("role"):
roles_any.append(str(filters["role"]))
skills_any += _as_list(filters.get("skills_any"))
skills_all += _as_list(filters.get("skills_all"))
roles_any += _as_list(filters.get("roles_any"))
skills_any = _uniq_keep_order([normalize_skill(s) or s for s in skills_any])
skills_all = _uniq_keep_order([normalize_skill(s) or s for s in skills_all])
roles_any = _uniq_keep_order(roles_any)
if must_skills:
skills_all = _uniq_keep_order(skills_all + must_skills)
# Denis rule: if any skills were provided -> enforce ANY match
if skills_any:
clause, args = _pipe_any_clause("c.skills_norm", skills_any)
where.append(clause)
params.extend(args)
if skills_all:
clause, args = _pipe_all_clause("c.skills_norm", skills_all)
where.append(clause)
params.extend(args)
if roles_any:
clause, args = _pipe_any_clause("c.roles_norm", roles_any)
where.append(clause)
params.extend(args)
if exclude_skills:
for sk in exclude_skills:
where.append("instr(c.skills_norm, ?) = 0")
params.append(f"|{sk}|")
# -------- FTS query (SAFE) --------
fts_q = _fts_safe_query(cleaned_query or "")
limit = max(1, min(int(limit or 20), 100))
offset = max(0, int(offset or 0))
# UPDATED SQL: Added experience_years_eng and language/backend metadata
sql = f"""
SELECT
c.candidate_id,
c.name,
c.location,
c.remote,
c.experience_years,
c.experience_years_eng,
c.experience_confidence,
c.salary_min,
c.salary_max,
c.salary_confidence,
c.english_level,
c.roles_json,
c.skills_json,
c.primary_languages_json,
c.backend_focus,
r.doc_type,
r.doc_type_confidence,
r.parse_method,
r.resume_id,
snippet(resumes_fts, 2, '[', ']', '', 14) AS snippet,
bm25(resumes_fts) AS rank
FROM resumes_fts
JOIN resumes r ON r.resume_id = resumes_fts.resume_id
JOIN candidates c ON c.candidate_id = resumes_fts.candidate_id
WHERE resumes_fts MATCH ? AND {" AND ".join(where)}
ORDER BY rank
LIMIT ? OFFSET ?
"""
rows = con.execute(sql, [fts_q] + params + [limit, offset]).fetchall()
out: List[Dict[str, Any]] = []
for row in rows:
cand_id = row["candidate_id"]
contacts_map = _fetch_contacts_map(con, cand_id)
out.append(
{
"candidate_id": cand_id,
"name": row["name"],
"location": row["location"],
"remote": bool(row["remote"]) if row["remote"] is not None else None,
"experience_years": row["experience_years"],
"experience_years_eng": row["experience_years_eng"], # Passed to agent
"experience_confidence": row["experience_confidence"],
"salary_min": row["salary_min"],
"salary_max": row["salary_max"],
"salary_confidence": row["salary_confidence"],
"english_level": row["english_level"],
"roles": json.loads(row["roles_json"] or "[]"),
"skills": json.loads(row["skills_json"] or "[]"),
"primary_languages": json.loads(row["primary_languages_json"] or "[]"),
"backend_focus": (bool(row["backend_focus"]) if row["backend_focus"] is not None else None),
"doc_type": row["doc_type"],
"doc_type_confidence": row["doc_type_confidence"],
"parse_method": row["parse_method"],
"contacts": contacts_map,
"resume_id": row["resume_id"],
"snippet": row["snippet"],
"rank": row["rank"],
}
)
return out
# -----------------------------
# Agent helper (SearchPlan -> search())
# -----------------------------
def _join_csv(xs: List[str]) -> str:
xs = [str(x).strip() for x in (xs or []) if str(x).strip()]
return ",".join(xs)
def search_with_filters(con: sqlite3.Connection, plan: Any) -> Dict[str, Any]:
"""
Wrapper for agent.py.
Expects `plan` with fields:
query_text, skills_any, skills_all, roles_any, location, remote,
english_min, exp_years_min, salary_min, salary_max, limit, sort
Returns:
{ "items": [...], "count": N }
"""
filters = {
"remote": getattr(plan, "remote", None),
"location": getattr(plan, "location", None),
"experience_min": getattr(plan, "exp_years_min", None),
"salary_min": getattr(plan, "salary_min", None),
"salary_max": getattr(plan, "salary_max", None),
"english": getattr(plan, "english_min", None),
"roles_any": _join_csv(getattr(plan, "roles_any", []) or []),
"skills_any": _join_csv(getattr(plan, "skills_any", []) or []),
"skills_all": _join_csv(getattr(plan, "skills_all", []) or []),
}
items = search(
con,
query=(getattr(plan, "query_text", "") or "").strip(),
filters=filters,
limit=int(getattr(plan, "limit", 20) or 20),
offset=0,
)
sort_mode = (getattr(plan, "sort", "rank") or "rank").strip()
if sort_mode == "exp_desc":
def k(it: Dict[str, Any]):
v = it.get("experience_years")
return (v is None, -(v or 0.0))
items = sorted(items, key=k)
elif sort_mode == "salary_desc":
def k(it: Dict[str, Any]):
v = it.get("salary_max") if it.get("salary_max") is not None else it.get("salary_min")
return (v is None, -(v or 0))
items = sorted(items, key=k)
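# Both sort keys are (is_missing, -value): candidates without the value sort last,
# the rest descend by experience or expected salary.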
return {"items": items, "count": len(items)}

33
util.py Normal file
View File

@@ -0,0 +1,33 @@
from __future__ import annotations
import json
import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Optional
def utc_iso() -> str:
return datetime.utcnow().replace(microsecond=0).isoformat() + "Z"
class Logger:
def __init__(self, log_path: Optional[str] = None):
self.log_path = Path(log_path) if log_path else None
if self.log_path:
self.log_path.parent.mkdir(parents=True, exist_ok=True)
def _write(self, level: str, msg: str, extra: Optional[Dict[str, Any]] = None) -> None:
line = f"{utc_iso()} [{level}] {msg}"
print(line, file=sys.stdout, flush=True)
if self.log_path:
payload = {"ts": utc_iso(), "level": level, "msg": msg, "extra": extra or {}}
with self.log_path.open("a", encoding="utf-8") as f:
f.write(json.dumps(payload, ensure_ascii=False) + "\n")
def info(self, msg: str, extra: Optional[Dict[str, Any]] = None) -> None:
self._write("INFO", msg, extra)
def warn(self, msg: str, extra: Optional[Dict[str, Any]] = None) -> None:
self._write("WARN", msg, extra)
def error(self, msg: str, extra: Optional[Dict[str, Any]] = None) -> None:
self._write("ERROR", msg, extra)