Initial commit
3  .gitignore  vendored  Normal file
@@ -0,0 +1,3 @@
.venv/
__pycache__/
*.pyc
2  __init__.py  Normal file
@@ -0,0 +1,2 @@
__all__ = []
__version__ = "1.0.0"
77  api.py  Normal file
@@ -0,0 +1,77 @@
from __future__ import annotations

import os
from typing import Any, Dict, Optional

from fastapi import FastAPI
from pydantic import BaseModel, Field

from tg_resume_db.db import connect, init_db
from tg_resume_db.agent import agent_search
from tg_resume_db.search import search as db_search

DB_PATH = os.environ.get("CANDIDATES_DB", "./candidates.db")

app = FastAPI(title="Resume Search API", version="1.0")


class SearchRequest(BaseModel):
    query: str = Field(default="")
    limit: int = Field(default=20, ge=1, le=100)
    offset: int = Field(default=0, ge=0)
    remote: Optional[bool] = None
    location: Optional[str] = None
    experience_min: Optional[float] = None
    salary_min: Optional[int] = None
    salary_max: Optional[int] = None
    english: Optional[str] = None
    role: Optional[str] = None
    skill: Optional[str] = None


class AISearchRequest(BaseModel):
    prompt: str = Field(default="")
    limit: int = Field(default=20, ge=1, le=100)
    ai_iters: int = Field(default=2, ge=0, le=5)


@app.on_event("startup")
def _startup():
    con = connect(DB_PATH)
    init_db(con)
    con.close()


@app.get("/health")
def health():
    return {"ok": True}


@app.post("/search")
def search(req: SearchRequest) -> Dict[str, Any]:
    con = connect(DB_PATH)
    try:
        items = db_search(con, query=req.query, filters=req.model_dump(), limit=req.limit, offset=req.offset)
        return {"items": items, "count": len(items)}
    finally:
        con.close()


@app.post("/search/ai")
def search_ai(req: AISearchRequest) -> Dict[str, Any]:
    con = connect(DB_PATH)
    try:
        res = agent_search(
            con,
            user_prompt=req.prompt,
            max_iters=req.ai_iters,
            limit=req.limit,
        )
        return {
            "ai": True,
            "llm_used": res.get("llm_used", False),
            "plan": res.get("plan"),
            "history": res.get("history"),
            "postfilter": res.get("postfilter"),
            "items": res.get("items", []),
            "count": int(res.get("count", 0)),
        }
    finally:
        con.close()
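A minimal smoke-test sketch for these endpoints, assuming the full tg_resume_db package (including the agent and search modules imported above) is importable and that FastAPI's TestClient is available; the DB path is illustrative:

import os
os.environ.setdefault("CANDIDATES_DB", "./candidates.db")  # must be set before the import below

from fastapi.testclient import TestClient
from tg_resume_db.api import app

with TestClient(app) as client:  # the context manager fires the startup hook (init_db)
    assert client.get("/health").json() == {"ok": True}
    resp = client.post("/search", json={"query": "python backend", "limit": 5})
    print(resp.json()["count"])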
267  bundle_export.py  Normal file
@@ -0,0 +1,267 @@
from __future__ import annotations

import json
import os
import re
import shutil
import sqlite3
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional

# NEW: PDF merge helper (pypdf)
# pip install pypdf
try:
    from tg_resume_db.pdf_merge import merge_all_pdfs_in_dir
except Exception:  # keep bundling usable even if pypdf/the module is not installed
    merge_all_pdfs_in_dir = None


def _slug(s: str, max_len: int = 60) -> str:
    s = (s or "").strip()
    if not s:
        return "candidate"
    s = re.sub(r"\s+", " ", s)
    s = re.sub(r"[^0-9A-Za-zА-Яа-я_\- ]+", "_", s)
    s = s.replace(" ", "_")
    s = re.sub(r"_+", "_", s).strip("_")
    if not s:
        return "candidate"
    return s[:max_len]


def _safe_mkdir(p: Path) -> None:
    p.mkdir(parents=True, exist_ok=True)


def _pick_source_paths(con: sqlite3.Connection, resume_id: str) -> List[str]:
    """
    Return candidate paths to the resume file, highest priority first:
    1) resumes.file_path
    2) sources.original_file_path
    3) a few path variants from sources.extra_json
    """
    paths: List[str] = []

    row = con.execute(
        "SELECT file_path FROM resumes WHERE resume_id=?",
        (resume_id,),
    ).fetchone()
    if row and row["file_path"]:
        paths.append(str(row["file_path"]))

    cur = con.execute(
        """SELECT original_file_path, original_file_name, extra_json
           FROM sources
           WHERE resume_id=?""",
        (resume_id,),
    )
    for r in cur.fetchall():
        ofp = r["original_file_path"]
        if ofp:
            paths.append(str(ofp))

        try:
            extra = json.loads(r["extra_json"] or "{}")
            if isinstance(extra, dict):
                for k in ("file_path", "path", "local_path", "source_path"):
                    if extra.get(k):
                        paths.append(str(extra[k]))
        except Exception:
            pass

    # dedupe
    seen = set()
    out: List[str] = []
    for p in paths:
        p2 = os.path.normpath(p)
        if p2 in seen:
            continue
        seen.add(p2)
        out.append(p2)
    return out


def _copy_unique(src: Path, dst_dir: Path, base_name: str) -> Path:
    ext = src.suffix.lower() if src.suffix else ""
    candidate = f"{base_name}{ext}"
    dst = dst_dir / candidate

    if dst.exists():
        i = 2
        while True:
            dst = dst_dir / f"{base_name}({i}){ext}"
            if not dst.exists():
                break
            i += 1

    shutil.copy2(src, dst)
    return dst


def bundle_search_results(
    con: sqlite3.Connection,
    results: Iterable[Dict[str, Any]],
    out_dir: str,
    *,
    copy_files: bool = True,
    merge_text: bool = True,
    merge_pdf: bool = True,  # NEW
) -> Dict[str, Any]:
    """
    results: iterable of dicts with at least:
      - resume_id
      - candidate_id
      - name (preferably)

    Creates:
      - files/: copies of the original resume files
      - merged_resumes.txt: concatenated clean_text from the DB (if merge_text)
      - pdf/merged.pdf: all PDFs from files/ merged (if merge_pdf and pypdf is installed)
      - manifest.json
      - README.txt
    """
    out_root = Path(out_dir).resolve()
    files_dir = out_root / "files"
    _safe_mkdir(files_dir)

    manifest: List[Dict[str, Any]] = []
    copied = 0
    missing = 0

    merged_parts: List[str] = []
    merged_txt_path = out_root / "merged_resumes.txt"

    for item in results:
        resume_id = item.get("resume_id")
        cand_id = item.get("candidate_id")
        name = item.get("name") or ""
        if not resume_id or not cand_id:
            continue

        # merged TXT from the DB
        if merge_text:
            row = con.execute(
                "SELECT clean_text FROM resumes WHERE resume_id=?",
                (resume_id,),
            ).fetchone()
            clean_text = (row["clean_text"] if row else "") or ""
            header = f"===== {name or cand_id} | {cand_id} | {resume_id} ====="
            merged_parts.append(header)
            merged_parts.append(clean_text.strip())
            merged_parts.append("")

        if not copy_files:
            continue

        src_paths = _pick_source_paths(con, resume_id)

        src_found: Optional[Path] = None
        for sp in src_paths:
            p = Path(sp)
            if p.exists() and p.is_file():
                src_found = p
                break

        if not src_found:
            missing += 1
            manifest.append(
                {
                    "candidate_id": cand_id,
                    "name": name,
                    "resume_id": resume_id,
                    "copied": False,
                    "reason": "source_file_not_found",
                    "tried_paths": src_paths,
                }
            )
            continue

        base = f"{_slug(name) or _slug(cand_id)}__{resume_id}"
        try:
            dst = _copy_unique(src_found, files_dir, base)
            copied += 1
            manifest.append(
                {
                    "candidate_id": cand_id,
                    "name": name,
                    "resume_id": resume_id,
                    "copied": True,
                    "source_path": str(src_found),
                    "dest_path": str(dst),
                }
            )
        except Exception as e:
            missing += 1
            manifest.append(
                {
                    "candidate_id": cand_id,
                    "name": name,
                    "resume_id": resume_id,
                    "copied": False,
                    "reason": f"copy_failed: {repr(e)}",
                    "source_path": str(src_found),
                }
            )

    # merged TXT
    if merge_text:
        merged_txt_path.write_text("\n".join(merged_parts), encoding="utf-8", errors="ignore")

    # NEW: merged PDF from files/*.pdf
    merged_pdf_path: Optional[Path] = None
    pdf_info: Optional[Dict[str, Any]] = None
    if merge_pdf and copy_files and merge_all_pdfs_in_dir is not None:
        try:
            merged_pdf_path = out_root / "pdf" / "merged.pdf"
            _safe_mkdir(merged_pdf_path.parent)
            pdf_info = merge_all_pdfs_in_dir(files_dir, merged_pdf_path)
        except Exception as e:
            pdf_info = {"error": f"merge_pdf_failed: {repr(e)}"}

    # manifest.json
    (out_root / "manifest.json").write_text(
        json.dumps(
            {
                "out_dir": str(out_root),
                "copied_files": copied,
                "missing_files": missing,
                "merged_text": str(merged_txt_path) if merge_text else None,
                "merged_pdf": str(merged_pdf_path) if merged_pdf_path else None,
                "pdf_info": pdf_info,
                "items": manifest,
            },
            ensure_ascii=False,
            indent=2,
        ),
        encoding="utf-8",
        errors="ignore",
    )

    # README
    readme_lines = [
        "This folder was created by the search command.",
        "- files/: copies of the original resume files",
        "- merged_resumes.txt: concatenated clean_text from the DB",
        "- manifest.json: what was copied from where / what was not found",
    ]
    if merge_pdf:
        if merge_all_pdfs_in_dir is None:
            readme_lines.append("- pdf/merged.pdf: NOT created (requires the pypdf package and the pdf_merge.py module)")
        else:
            readme_lines.append("- pdf/merged.pdf: all PDFs from files/ merged (if any PDFs were present)")

    (out_root / "README.txt").write_text(
        "\n".join(readme_lines) + "\n",
        encoding="utf-8",
        errors="ignore",
    )

    return {
        "out_dir": str(out_root),
        "copied_files": copied,
        "missing_files": missing,
        "merged_text": str(merged_txt_path) if merge_text else None,
        "merged_pdf": str(merged_pdf_path) if merged_pdf_path else None,
        "manifest": str(out_root / "manifest.json"),
        "pdf_info": pdf_info,
    }
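A usage sketch for bundle_search_results, assuming a candidates.db already populated by import_exports; the query, the output directory, and the empty filters dict (an assumption about the search module's contract) are illustrative:

from tg_resume_db.bundle_export import bundle_search_results
from tg_resume_db.db import connect, init_db
from tg_resume_db.search import search

con = connect("./candidates.db")
init_db(con)
try:
    items = search(con, query="golang", filters={}, limit=10, offset=0)
    info = bundle_search_results(con, items, "./bundle_demo", merge_pdf=True)
    print(info["copied_files"], info["missing_files"], info["manifest"])
finally:
    con.close()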
282  cli.py  Normal file
@@ -0,0 +1,282 @@
from __future__ import annotations

import argparse
import json
import sys
from datetime import datetime
from typing import Any, Dict
from pathlib import Path

import os

from tg_resume_db.bundle_export import bundle_search_results
from tg_resume_db.db import connect, init_db
from tg_resume_db.pipeline import import_exports as run_import
from tg_resume_db.search import search as run_search
from tg_resume_db.util import Logger
from tg_resume_db.extract.text_extract import extract_text as extract_text_generic
from tg_resume_db.extract.pdf_extract import extract_pdf_best
from tg_resume_db.extract.clean import normalize_text
from tg_resume_db.extract.doc_type import detect_doc_type
from tg_resume_db.extract.sections import split_sections, sections_present
from tg_resume_db.extract.parse import extract_name_guess


def _print_json(obj: Dict[str, Any]) -> None:
    s = json.dumps(obj, ensure_ascii=False, indent=2)
    try:
        print(s)
    except UnicodeEncodeError:
        # Fallback for cp1251/legacy consoles.
        print(s.encode("ascii", "backslashreplace").decode("ascii"))


def _is_interactive() -> bool:
    return sys.stdin.isatty() and sys.stdout.isatty()


def main() -> None:
    ap = argparse.ArgumentParser(prog="tg_resume_db")
    sub = ap.add_subparsers(dest="cmd", required=True)

    # ---------------- import_exports ----------------
    imp = sub.add_parser("import_exports", help="Import Telegram exports recursively (incremental)")
    imp.add_argument("--input", required=True, help="Path to exports directory")
    imp.add_argument("--db", required=True, help="SQLite db path")
    imp.add_argument("--log", default="./import.log", help="Log file path")
    imp.add_argument("--near-dist", type=int, default=6, help="Simhash max Hamming distance for near-duplicates")
    imp.add_argument("--min-text-len", type=int, default=250, help="Skip very short texts")
    imp.add_argument(
        "--llm",
        choices=["auto", "off", "force"],
        default="auto",
        help="LLM enrichment mode: auto (default), off to disable, force to always run when configured",
    )
    imp.add_argument(
        "--llm-review",
        choices=["always", "auto", "off"],
        default="always",
        help="LLM review mode for parsed JSON: always (default), auto, off",
    )
    imp.add_argument(
        "--llm-review-rounds",
        type=int,
        default=1,
        help="How many LLM review merge rounds to run per resume (1..3)",
    )

    # ---------------- search ----------------
    s = sub.add_parser("search", help="Search candidates")
    s.add_argument("--db", required=True)
    s.add_argument("--query", required=True)
    s.add_argument("--limit", type=int, default=20)
    s.add_argument("--offset", type=int, default=0)
    s.add_argument("--remote", choices=["true", "false"], default=None)
    s.add_argument("--location", default=None)
    s.add_argument("--experience-min", type=float, default=None)
    s.add_argument("--salary-min", type=int, default=None)
    s.add_argument("--salary-max", type=int, default=None)
    s.add_argument("--english", default=None)
    s.add_argument("--doc-type", default=None)

    # AI mode
    s.add_argument("--ai", action="store_true", help="Use LLM to build filters from text query and run search")
    s.add_argument("--ai-iters", type=int, default=2, help="How many refine iterations for AI search")

    # Backward compatible single-value filters
    s.add_argument("--role", default=None, help="Single role (backward compatible); prefer --roles-any")
    s.add_argument("--skill", default=None, help="Single skill (backward compatible); prefer --skills-any/--skills-all")

    # Stack filters (comma-separated)
    s.add_argument("--roles-any", default=None, help="Comma-separated roles; at least one must match")
    s.add_argument("--skills-any", default=None, help="Comma-separated skills; at least one must match")
    s.add_argument("--skills-all", default=None, help="Comma-separated skills; all must match")

    # Bundle export behavior
    s.add_argument("--bundle", choices=["ask", "yes", "no"], default="ask", help="Bundle found resumes into a folder")

    # ---------------- inspect ----------------
    ins = sub.add_parser("inspect", help="Inspect a single resume file (doc_type/sections)")
    ins.add_argument("--file", required=True, help="Path to resume file")

    args = ap.parse_args()

    # ========================= import_exports =========================
    if args.cmd == "import_exports":
        con = connect(args.db)
        try:
            init_db(con)
            log = Logger(args.log)

            prev_enabled = os.environ.get("LLM_PARSE_ENABLED")
            prev_force = os.environ.get("LLM_PARSE_FORCE")
            prev_review_mode = os.environ.get("LLM_PARSE_REVIEW_MODE")
            prev_review_rounds = os.environ.get("LLM_PARSE_REVIEW_ROUNDS")
            try:
                if args.llm == "off":
                    os.environ["LLM_PARSE_ENABLED"] = "0"
                    os.environ["LLM_PARSE_REVIEW_MODE"] = "off"
                elif args.llm == "force":
                    os.environ["LLM_PARSE_ENABLED"] = "1"
                    os.environ["LLM_PARSE_FORCE"] = "1"
                    os.environ["LLM_PARSE_REVIEW_MODE"] = "always"
                else:
                    os.environ["LLM_PARSE_REVIEW_MODE"] = args.llm_review

                rounds = max(1, min(int(args.llm_review_rounds), 3))
                os.environ["LLM_PARSE_REVIEW_ROUNDS"] = str(rounds)
                stats = run_import(
                    con=con,
                    input_dir=args.input,
                    log=log,
                    max_near_dist=args.near_dist,
                    min_text_len=args.min_text_len,
                )
            finally:
                if args.llm == "off":
                    if prev_enabled is None:
                        os.environ.pop("LLM_PARSE_ENABLED", None)
                    else:
                        os.environ["LLM_PARSE_ENABLED"] = prev_enabled
                elif args.llm == "force":
                    if prev_enabled is None:
                        os.environ.pop("LLM_PARSE_ENABLED", None)
                    else:
                        os.environ["LLM_PARSE_ENABLED"] = prev_enabled
                    if prev_force is None:
                        os.environ.pop("LLM_PARSE_FORCE", None)
                    else:
                        os.environ["LLM_PARSE_FORCE"] = prev_force
                if prev_review_mode is None:
                    os.environ.pop("LLM_PARSE_REVIEW_MODE", None)
                else:
                    os.environ["LLM_PARSE_REVIEW_MODE"] = prev_review_mode
                if prev_review_rounds is None:
                    os.environ.pop("LLM_PARSE_REVIEW_ROUNDS", None)
                else:
                    os.environ["LLM_PARSE_REVIEW_ROUNDS"] = prev_review_rounds
        finally:
            con.close()

        _print_json(stats)
        return

    # ============================= search =============================
    if args.cmd == "search":
        con = connect(args.db)
        init_db(con)  # important: guarantees that resumes_fts and its triggers exist

        try:
            items: list[Dict[str, Any]] = []
            out: Dict[str, Any] = {}

            if args.ai:
                from tg_resume_db.agent import agent_search

                res = agent_search(
                    con,
                    user_prompt=args.query,
                    max_iters=args.ai_iters,
                )

                items = res.get("items", [])
                out = {
                    "ai": True,
                    "llm_used": res.get("llm_used", False),
                    "plan": res.get("plan"),
                    "history": res.get("history"),
                    "postfilter": res.get("postfilter"),
                    "items": items,
                    "count": res.get("count", len(items)),
                }
            else:
                filters = {
                    "remote": (args.remote == "true") if args.remote is not None else None,
                    "location": args.location,
                    "experience_min": args.experience_min,
                    "salary_min": args.salary_min,
                    "salary_max": args.salary_max,
                    "english": args.english,
                    "doc_type": args.doc_type,
                    # backward compat
                    "role": args.role,
                    "skill": args.skill,
                    # new
                    "roles_any": args.roles_any,
                    "skills_any": args.skills_any,
                    "skills_all": args.skills_all,
                }

                items = run_search(
                    con,
                    query=args.query,
                    filters=filters,
                    limit=args.limit,
                    offset=args.offset,
                )
                out = {"ai": False, "items": items, "count": len(items)}

            # 1) print the results
            _print_json(out)

            # 2) bundle prompt/flag
            if args.bundle == "yes":
                do_bundle = True
            elif args.bundle == "no":
                do_bundle = False
            else:  # ask
                do_bundle = False
                if _is_interactive():
                    ans = input("\nBundle the found resumes into a folder? (Y/N): ").strip().lower()
                    do_bundle = ans in ("y", "yes", "да", "д")

            if do_bundle:
                ts = datetime.now().strftime("%Y%m%d_%H%M%S")
                out_dir = f"./bundle_{ts}"
                info = bundle_search_results(con, items, out_dir, copy_files=True, merge_text=True)
                print(f"\n[done] Bundle ready: {info['out_dir']}")
                print(f"  files copied: {info['copied_files']}, missing: {info['missing_files']}")
                print(f"  merged: {info['merged_text']}")
                print(f"  manifest: {info['manifest']}")

            return

        finally:
            con.close()

    # ============================= inspect =============================
    if args.cmd == "inspect":
        fp = args.file
        path = Path(fp)
        extract_meta = {}
        if path.suffix.lower() == ".pdf":
            pdf_res = extract_pdf_best(path, timeout_sec=25)
            raw_text = pdf_res.text
            extract_meta = {
                "method": pdf_res.method,
                "quality_score": pdf_res.score,
                "quality_flags": pdf_res.flags,
                "pages": len(pdf_res.pages),
            }
        else:
            raw_text = extract_text_generic(path)
            extract_meta = {"method": "generic"}

        clean = normalize_text(raw_text or "")
        dt = detect_doc_type(clean, file_ext=Path(fp).suffix.lower())
        secs = split_sections(clean, dt.doc_type)
        out = {
            "file": fp,
            "doc_type": dt.doc_type,
            "confidence": dt.confidence,
            "signals": dt.signals,
            "extract": extract_meta,
            "sections_present": sections_present(secs),
            "name_guess": extract_name_guess(clean),
        }
        _print_json(out)
        return


if __name__ == "__main__":
    main()
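Example invocations, sketched through the module entry point rather than a shell (equivalent to running the installed tg_resume_db command); the paths and filter values are illustrative:

import sys
from tg_resume_db.cli import main

# Incremental import of Telegram exports with LLM enrichment disabled.
sys.argv = ["tg_resume_db", "import_exports", "--input", "./exports",
            "--db", "./candidates.db", "--llm", "off"]
main()

# Filtered search, skipping the interactive bundle prompt.
sys.argv = ["tg_resume_db", "search", "--db", "./candidates.db",
            "--query", "senior python", "--skills-any", "django,fastapi",
            "--bundle", "no"]
main()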
296  db.py  Normal file
@@ -0,0 +1,296 @@
from __future__ import annotations

import sqlite3
from pathlib import Path


SCHEMA = r"""
PRAGMA journal_mode=WAL;
PRAGMA synchronous=NORMAL;
PRAGMA temp_store=MEMORY;

CREATE TABLE IF NOT EXISTS candidates (
  candidate_id TEXT PRIMARY KEY,
  name TEXT,
  location TEXT,
  remote INTEGER,
  experience_years REAL,
  experience_years_eng REAL, -- engineering experience (after the HR filter)
  experience_confidence REAL,
  salary_min INTEGER,
  salary_max INTEGER,
  salary_confidence REAL,
  english_level TEXT,
  roles_json TEXT,
  skills_json TEXT,
  primary_languages_json TEXT,
  backend_focus INTEGER,
  roles_norm TEXT, -- "|backend|devops|"
  skills_norm TEXT, -- "|python|k8s|"
  created_at TEXT DEFAULT (datetime('now')),
  updated_at TEXT DEFAULT (datetime('now'))
);

CREATE TABLE IF NOT EXISTS candidate_contacts (
  contact_type TEXT NOT NULL, -- email/phone/tg/github/linkedin/url
  contact_value TEXT NOT NULL, -- normalized
  candidate_id TEXT NOT NULL,
  created_at TEXT DEFAULT (datetime('now')),
  PRIMARY KEY(contact_type, contact_value),
  FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);

CREATE INDEX IF NOT EXISTS idx_contacts_candidate ON candidate_contacts(candidate_id);

CREATE TABLE IF NOT EXISTS resumes (
  resume_id TEXT PRIMARY KEY,
  candidate_id TEXT NOT NULL,
  sha256 TEXT,
  simhash TEXT,
  clean_text TEXT NOT NULL,
  raw_text TEXT,
  extraction_json TEXT,
  llm_summary TEXT,
  llm_tags_json TEXT,
  extract_method TEXT,
  extract_quality_score REAL,
  extract_quality_flags TEXT,
  extract_pages_json TEXT,
  doc_type TEXT,
  doc_type_confidence REAL,
  parse_method TEXT,
  parse_version TEXT,
  sections_json TEXT,
  is_active INTEGER DEFAULT 1,
  duplicate_of_resume_id TEXT,
  file_path TEXT,
  file_mtime INTEGER,
  file_size INTEGER,
  created_at TEXT DEFAULT (datetime('now')),
  FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);

CREATE UNIQUE INDEX IF NOT EXISTS idx_resumes_sha ON resumes(sha256) WHERE sha256 IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_resumes_candidate ON resumes(candidate_id);
CREATE INDEX IF NOT EXISTS idx_resumes_active ON resumes(is_active);

CREATE TABLE IF NOT EXISTS sources (
  source_id INTEGER PRIMARY KEY AUTOINCREMENT,
  resume_id TEXT NOT NULL,
  export_path TEXT,
  chat_title TEXT,
  message_id TEXT,
  message_date TEXT,
  origin_type TEXT,
  original_file_path TEXT,
  original_file_name TEXT,
  extra_json TEXT,
  created_at TEXT DEFAULT (datetime('now')),
  FOREIGN KEY(resume_id) REFERENCES resumes(resume_id)
);

CREATE TABLE IF NOT EXISTS files_seen (
  sha256 TEXT PRIMARY KEY,
  size INTEGER,
  mtime INTEGER,
  canonical_resume_id TEXT,
  first_seen_at TEXT DEFAULT (datetime('now')),
  last_seen_at TEXT DEFAULT (datetime('now'))
);

CREATE TABLE IF NOT EXISTS simhash_buckets (
  bucket INTEGER NOT NULL,
  band INTEGER NOT NULL,
  resume_id TEXT NOT NULL,
  PRIMARY KEY(bucket, band, resume_id),
  FOREIGN KEY(resume_id) REFERENCES resumes(resume_id)
);

CREATE TABLE IF NOT EXISTS candidate_skills (
  candidate_id TEXT NOT NULL,
  skill_id TEXT NOT NULL,
  skill_label TEXT,
  confidence REAL,
  source TEXT,
  evidence TEXT,
  created_at TEXT DEFAULT (datetime('now')),
  PRIMARY KEY(candidate_id, skill_id),
  FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);

CREATE TABLE IF NOT EXISTS candidate_roles (
  candidate_id TEXT NOT NULL,
  role TEXT NOT NULL,
  confidence REAL,
  source TEXT,
  evidence TEXT,
  created_at TEXT DEFAULT (datetime('now')),
  PRIMARY KEY(candidate_id, role),
  FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);

CREATE TABLE IF NOT EXISTS candidate_languages (
  candidate_id TEXT NOT NULL,
  language TEXT NOT NULL,
  level TEXT,
  confidence REAL,
  source TEXT,
  evidence TEXT,
  created_at TEXT DEFAULT (datetime('now')),
  PRIMARY KEY(candidate_id, language),
  FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);

CREATE TABLE IF NOT EXISTS positions (
  position_id TEXT PRIMARY KEY,
  resume_id TEXT NOT NULL,
  candidate_id TEXT NOT NULL,
  title TEXT,
  company TEXT,
  date_from TEXT,
  date_to TEXT,
  is_current INTEGER,
  description TEXT,
  stack_json TEXT,
  created_at TEXT DEFAULT (datetime('now')),
  FOREIGN KEY(resume_id) REFERENCES resumes(resume_id),
  FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);

CREATE TABLE IF NOT EXISTS llm_cache (
  cache_key TEXT PRIMARY KEY,
  model TEXT,
  result_json TEXT,
  created_at TEXT DEFAULT (datetime('now'))
);

-- Full-text index (FTS5); standalone table kept in sync by the triggers below
CREATE VIRTUAL TABLE IF NOT EXISTS resumes_fts USING fts5(
  resume_id UNINDEXED,
  candidate_id UNINDEXED,
  clean_text,
  tokenize='unicode61 remove_diacritics 2'
);

-- --- Triggers to keep FTS synced with resumes ---
-- Insert
CREATE TRIGGER IF NOT EXISTS trg_resumes_ai_fts
AFTER INSERT ON resumes
BEGIN
  DELETE FROM resumes_fts WHERE resume_id = NEW.resume_id;
  INSERT INTO resumes_fts(resume_id, candidate_id, clean_text)
  SELECT NEW.resume_id, NEW.candidate_id, NEW.clean_text
  WHERE NEW.is_active = 1;
END;

-- Delete
CREATE TRIGGER IF NOT EXISTS trg_resumes_ad_fts
AFTER DELETE ON resumes
BEGIN
  DELETE FROM resumes_fts WHERE resume_id = OLD.resume_id;
END;

-- Update (text/active/candidate)
CREATE TRIGGER IF NOT EXISTS trg_resumes_au_fts
AFTER UPDATE OF clean_text, is_active, candidate_id ON resumes
BEGIN
  DELETE FROM resumes_fts WHERE resume_id = NEW.resume_id;
  INSERT INTO resumes_fts(resume_id, candidate_id, clean_text)
  SELECT NEW.resume_id, NEW.candidate_id, NEW.clean_text
  WHERE NEW.is_active = 1;
END;
"""


def connect(db_path: str) -> sqlite3.Connection:
    Path(db_path).parent.mkdir(parents=True, exist_ok=True)
    con = sqlite3.connect(db_path)
    con.row_factory = sqlite3.Row
    return con


def _table_exists(con: sqlite3.Connection, name: str) -> bool:
    row = con.execute(
        "SELECT 1 FROM sqlite_master WHERE type IN ('table','view') AND name=? LIMIT 1",
        (name,),
    ).fetchone()
    return row is not None


def _column_exists(con: sqlite3.Connection, table: str, column: str) -> bool:
    cur = con.execute(f"PRAGMA table_info({table})")
    for r in cur.fetchall():
        if r["name"] == column:
            return True
    return False


def _add_column_if_missing(con: sqlite3.Connection, table: str, column: str, ddl_type: str) -> None:
    if not _table_exists(con, table):
        return
    if _column_exists(con, table, column):
        return
    con.execute(f"ALTER TABLE {table} ADD COLUMN {column} {ddl_type}")


def _ensure_fts_backfilled(con: sqlite3.Connection) -> None:
    """
    If resumes_fts is empty or out of sync, rebuild it from resumes.
    This fixes the case where init_db created the FTS table but nothing
    ever populated it, so search always returned 0 results.
    """
    if not _table_exists(con, "resumes") or not _table_exists(con, "resumes_fts"):
        return

    try:
        resumes_cnt = int(con.execute("SELECT COUNT(*) AS c FROM resumes WHERE is_active=1").fetchone()["c"])
        fts_cnt = int(con.execute("SELECT COUNT(*) AS c FROM resumes_fts").fetchone()["c"])
    except Exception:
        return

    if resumes_cnt <= 0:
        return

    # Any mismatch -> rebuild (cures both an empty index and duplicates)
    if fts_cnt != resumes_cnt:
        con.execute("DELETE FROM resumes_fts")
        con.execute(
            """
            INSERT INTO resumes_fts(resume_id, candidate_id, clean_text)
            SELECT resume_id, candidate_id, clean_text
            FROM resumes
            WHERE is_active=1
            """
        )
        con.commit()


def init_db(con: sqlite3.Connection) -> None:
    con.executescript(SCHEMA)
    # Lightweight migrations for existing DBs (safe to re-run)
    _add_column_if_missing(con, "candidates", "experience_years_eng", "REAL")
    _add_column_if_missing(con, "candidates", "primary_languages_json", "TEXT")
    _add_column_if_missing(con, "candidates", "backend_focus", "INTEGER")
    _add_column_if_missing(con, "resumes", "llm_summary", "TEXT")
    _add_column_if_missing(con, "resumes", "llm_tags_json", "TEXT")
    _add_column_if_missing(con, "resumes", "extract_method", "TEXT")
    _add_column_if_missing(con, "resumes", "extract_quality_score", "REAL")
    _add_column_if_missing(con, "resumes", "extract_quality_flags", "TEXT")
    _add_column_if_missing(con, "resumes", "extract_pages_json", "TEXT")
    _add_column_if_missing(con, "resumes", "doc_type", "TEXT")
    _add_column_if_missing(con, "resumes", "doc_type_confidence", "REAL")
    _add_column_if_missing(con, "resumes", "parse_method", "TEXT")
    _add_column_if_missing(con, "resumes", "parse_version", "TEXT")
    _add_column_if_missing(con, "resumes", "sections_json", "TEXT")
    if not _table_exists(con, "llm_cache"):
        con.execute(
            """
            CREATE TABLE IF NOT EXISTS llm_cache (
                cache_key TEXT PRIMARY KEY,
                model TEXT,
                result_json TEXT,
                created_at TEXT DEFAULT (datetime('now'))
            )
            """
        )
    con.commit()
    _ensure_fts_backfilled(con)
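A small sketch of the trigger behavior: inserting an active resume makes it immediately visible to MATCH queries against resumes_fts (in-memory DB, made-up values):

from tg_resume_db.db import connect, init_db

con = connect(":memory:")
init_db(con)
con.execute("INSERT INTO candidates(candidate_id, name) VALUES (?, ?)", ("c1", "Jane Doe"))
con.execute(
    "INSERT INTO resumes(resume_id, candidate_id, clean_text) VALUES (?, ?, ?)",
    ("r1", "c1", "Senior Python developer: FastAPI, PostgreSQL, Docker"),
)
row = con.execute(
    "SELECT resume_id FROM resumes_fts WHERE resumes_fts MATCH 'fastapi'"
).fetchone()
print(row["resume_id"])  # -> r1, indexed by the AFTER INSERT trigger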
41  dedup/simhash.py  Normal file
@@ -0,0 +1,41 @@
from __future__ import annotations

import hashlib
import re
from typing import List, Tuple

def sha256_file_bytes_iter(f, chunk_size: int = 1024 * 1024) -> str:
    h = hashlib.sha256()
    for chunk in iter(lambda: f.read(chunk_size), b""):
        h.update(chunk)
    return h.hexdigest()

def sha256_file(path) -> str:
    with open(path, "rb") as f:
        return sha256_file_bytes_iter(f)

def sha1_str(s: str) -> str:
    return hashlib.sha1(s.encode("utf-8", errors="ignore")).hexdigest()

def simhash64(text: str) -> int:
    tokens = re.findall(r"[a-zA-Z0-9_#+./-]{2,}", text.lower())
    if not tokens:
        return 0
    v = [0] * 64
    for tok in tokens:
        h = hashlib.md5(tok.encode("utf-8")).digest()
        x = int.from_bytes(h[:8], "big", signed=False)
        for i in range(64):
            v[i] += 1 if ((x >> i) & 1) else -1
    out = 0
    for i in range(64):
        if v[i] > 0:
            out |= (1 << i)
    return out

def hamming64(a: int, b: int) -> int:
    return (a ^ b).bit_count()

def simhash_bands(x: int) -> List[Tuple[int, int]]:
    # 4 bands x 16 bits
    return [(((x >> (band * 16)) & 0xFFFF), band) for band in range(4)]
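How the pieces combine for near-duplicate detection: the 16-bit band buckets serve as a cheap candidate filter (cf. the simhash_buckets table in db.py), and hamming64 is the final check against a threshold such as the CLI's --near-dist default of 6. The texts are made up:

from tg_resume_db.dedup.simhash import simhash64, hamming64, simhash_bands

a = simhash64("Senior Python developer. FastAPI, PostgreSQL, Docker, CI/CD.")
b = simhash64("Senior Python developer. FastAPI, PostgreSQL, Kubernetes, CI/CD.")

# Near-identical texts usually collide in at least one (bucket, band) pair...
print(set(simhash_bands(a)) & set(simhash_bands(b)))
# ...and the collision is confirmed with a Hamming-distance threshold.
print(hamming64(a, b) <= 6)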
39  extract/clean.py  Normal file
@@ -0,0 +1,39 @@
from __future__ import annotations

import re
from collections import Counter
import unicodedata

RE_PAGE = re.compile(r"^\s*(page|стр(аница)?)\s*\d+\s*(/|\s+of\s+)\s*\d+\s*$", re.I)
RE_ONLY_PUNCT = re.compile(r"^[\W_]+$", re.U)
RE_MULTI_SPACE = re.compile(r"[ \t]+")
RE_MULTI_NL = re.compile(r"\n{3,}")

_INVISIBLE_CHARS = ["\u00ad", "\u200b", "\u200c", "\u200d", "\ufeff"]
_BIDI_CTRL_RE = re.compile(r"[\u202a-\u202e\u2060\u2066-\u2069\ufffe\uffff]")

def normalize_text(raw: str) -> str:
    text = raw.replace("\r\n", "\n").replace("\r", "\n")
    for ch in _INVISIBLE_CHARS:
        text = text.replace(ch, "")
    text = _BIDI_CTRL_RE.sub("", text)
    # remove most control/format chars but keep line breaks and tabs
    text = "".join(
        ch for ch in text
        if ch in ("\n", "\t") or not unicodedata.category(ch).startswith("C")
    )
    text = "".join(ch for ch in text if ch == "\n" or (ch.isprintable() and ch != "\x0b"))
    lines = [RE_MULTI_SPACE.sub(" ", ln).strip() for ln in text.split("\n")]
    lines = [ln for ln in lines if ln and not RE_PAGE.match(ln) and not RE_ONLY_PUNCT.match(ln)]
    counts = Counter(lines)
    filtered = []
    for ln in lines:
        if counts[ln] >= 4 and len(ln) <= 90:
            continue
        filtered.append(ln)
    text = "\n".join(filtered)
    text = RE_MULTI_NL.sub("\n\n", text).strip()
    return text

def to_fts_text(clean: str) -> str:
    return re.sub(r"\s+", " ", clean).strip()
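A quick illustration of the normalizer on a made-up snippet: the soft hyphen vanishes, the page footer and blank lines are dropped, and runs of spaces collapse:

from tg_resume_db.extract.clean import normalize_text, to_fts_text

raw = "John\u00adDoe\r\nPage 1 of 3\r\n\r\n\r\nPython   Developer\r\n"
clean = normalize_text(raw)
print(repr(clean))         # 'JohnDoe\nPython Developer'
print(to_fts_text(clean))  # 'JohnDoe Python Developer'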
134  extract/doc_type.py  Normal file
@@ -0,0 +1,134 @@
from __future__ import annotations

import re
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class DocTypeResult:
    doc_type: str
    confidence: float
    signals: List[str]


_HH_PATTERNS = [
    (re.compile(r"\bhh\.ru\b", re.I), 2.0, "hh_domain"),
    (re.compile(r"\bheadhunter\b", re.I), 2.0, "headhunter"),
    (re.compile(r"\bрезюме\s+обновлено\b", re.I), 2.5, "hh_resume_updated"),
    (re.compile(r"\bжелаемая\s+должность\b", re.I), 1.2, "hh_desired_role"),
    (re.compile(r"\bключевые\s+навыки\b", re.I), 1.0, "hh_key_skills"),
    (re.compile(r"\bопыт\s+работы\b", re.I), 0.8, "hh_experience"),
]

_LI_PATTERNS = [
    (re.compile(r"\blinkedin\b", re.I), 2.5, "li_brand"),
    (re.compile(r"\blinkedin\.com\b", re.I), 2.0, "li_domain"),
    (re.compile(r"\bexperience\b", re.I), 0.9, "li_experience"),
    (re.compile(r"\beducation\b", re.I), 0.9, "li_education"),
    (re.compile(r"\bskills\b", re.I), 0.9, "li_skills"),
    (re.compile(r"\babout\b", re.I), 0.6, "li_about"),
]

_PPTX_PATTERNS = [
    (re.compile(r"\bslide\b", re.I), 1.0, "pptx_slide"),
    (re.compile(r"\bpowerpoint\b", re.I), 1.3, "pptx_powerpoint"),
    (re.compile(r"\bpptx\b", re.I), 1.3, "pptx_ext"),
    (re.compile(r"\bpresentation\b", re.I), 0.8, "pptx_presentation"),
]


def _score_patterns(text: str, patterns) -> tuple[float, List[str]]:
    score = 0.0
    signals: List[str] = []
    for rx, weight, name in patterns:
        if rx.search(text):
            score += weight
            signals.append(name)
    return score, signals


def _confidence_from_score(score: float) -> float:
    if score >= 4.0:
        return 0.92
    if score >= 3.0:
        return 0.85
    if score >= 2.0:
        return 0.75
    if score >= 1.2:
        return 0.62
    if score > 0.0:
        return 0.50
    return 0.30


def detect_doc_type(clean_text: str, file_ext: Optional[str] = None) -> DocTypeResult:
    lines = [ln.strip() for ln in (clean_text or "").splitlines() if ln.strip()]
    head_lines = lines[:80]
    head_text = "\n".join(head_lines)
    head_lc = head_text.lower()

    signals: List[str] = []

    hh_score, hh_signals = _score_patterns(head_text, _HH_PATTERNS)
    li_score, li_signals = _score_patterns(head_text, _LI_PATTERNS)
    pptx_score, pptx_signals = _score_patterns(head_text, _PPTX_PATTERNS)
    if file_ext and file_ext.lower() in (".pptx", ".ppt"):
        pptx_score += 2.0
        signals.append("pptx_ext")

    signals.extend(hh_signals + li_signals + pptx_signals)

    # One-page heuristic: short resumes with dense content
    total_len = len(clean_text or "")
    one_page_score = 0.0
    if len(lines) <= 70 and total_len <= 4500:
        one_page_score = 2.2
        signals.append("one_page_short")
    elif len(lines) <= 90 and total_len <= 6500:
        one_page_score = 1.6
        signals.append("one_page_medium")

    # Scan heuristic: very low textual content
    letters = sum(ch.isalpha() for ch in clean_text or "")
    total = max(1, len(clean_text or ""))
    letter_ratio = letters / total
    scan_score = 0.0
    if total_len < 200 or letter_ratio < 0.12:
        scan_score = 3.2
        signals.append("scan_low_text")
        if file_ext and file_ext.lower() in (".pdf", ".png", ".jpg", ".jpeg", ".tiff"):
            scan_score += 0.6
            signals.append("scan_file_ext")

    candidates = [
        ("hh_ru", hh_score),
        ("linkedin_pdf", li_score),
        ("pptx_export", pptx_score),
        ("one_page", one_page_score),
        ("scan_pdf", scan_score),
    ]
    doc_type, best_score = max(candidates, key=lambda x: x[1])

    if best_score <= 0.0:
        base = "generic_pdf" if (file_ext and file_ext.lower() == ".pdf") else "generic"
        return DocTypeResult(doc_type=base, confidence=0.35, signals=signals)

    confidence = _confidence_from_score(best_score)
    # If scan is detected strongly, prefer it
    if doc_type == "scan_pdf" and confidence >= 0.8:
        return DocTypeResult(doc_type="scan_pdf", confidence=confidence, signals=signals)

    # Split one-page into ru/en
    if doc_type == "one_page":
        if _looks_cyrillic(head_text):
            return DocTypeResult(doc_type="one_page_ru", confidence=confidence, signals=signals)
        return DocTypeResult(doc_type="one_page_en", confidence=confidence, signals=signals)

    return DocTypeResult(doc_type=doc_type, confidence=confidence, signals=signals)


def _looks_cyrillic(text: str) -> bool:
    cyr = len(re.findall(r"[А-Яа-яЁё]", text))
    lat = len(re.findall(r"[A-Za-z]", text))
    return cyr > lat and cyr >= 10
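An illustrative run of the classifier on a made-up HH.ru-style header: its markers outscore the other candidates (2.5 + 1.2 + 1.0 >= 4.0), so the sample is typed hh_ru at confidence 0.92:

from tg_resume_db.extract.doc_type import detect_doc_type

text = (
    "Резюме обновлено 12 мая 2024\n"
    "Желаемая должность: Backend-разработчик\n"
    "Ключевые навыки: Python, SQL, Docker"
)
res = detect_doc_type(text, file_ext=".pdf")
print(res.doc_type, res.confidence)  # hh_ru 0.92
print(res.signals)  # includes hh_resume_updated, hh_desired_role, hh_key_skills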
159  extract/experience.py  Normal file
@@ -0,0 +1,159 @@
from __future__ import annotations

import re
from dataclasses import dataclass
from datetime import date
from typing import Dict, List, Optional, Tuple

# Month maps (EN + RU)
MONTHS = {
    "jan": 1, "january": 1, "янв": 1, "январ": 1,
    "feb": 2, "february": 2, "фев": 2, "феврал": 2,
    "mar": 3, "march": 3, "мар": 3, "март": 3,
    "apr": 4, "april": 4, "апр": 4, "апрел": 4,
    "may": 5, "май": 5,
    "jun": 6, "june": 6, "июн": 6, "июнь": 6,
    "jul": 7, "july": 7, "июл": 7, "июль": 7,
    "aug": 8, "august": 8, "авг": 8, "август": 8,
    "sep": 9, "september": 9, "сен": 9, "сент": 9,
    "oct": 10, "october": 10, "окт": 10, "октя": 10,
    "nov": 11, "november": 11, "ноя": 11, "ноябр": 11,
    "dec": 12, "december": 12, "дек": 12, "дека": 12,
}

PRESENT_RE = re.compile(r"\b(present|now|current|настоящее время|по н\.в\.|по настоящее)\b", re.I)

# Direct "X years" patterns
DIRECT_YEARS_RE = re.compile(r"(\d+(?:[.,]\d+)?)\s*(?:\+?\s*)?(?:years?|yrs?|лет|года|год)\b", re.I)

# Dates like 03.2019, 2019, Jan 2020, янв 2020
MMYYYY_RE = re.compile(r"\b(0?[1-9]|1[0-2])[./-](\d{4})\b")
YYYY_RE = re.compile(r"\b(19\d{2}|20\d{2})\b")
MON_YYYY_RE = re.compile(r"\b([A-Za-z]{3,9}|[А-Яа-я]{3,9})\.?\s*(\d{4})\b")

# Range separators
RANGE_RE = re.compile(r"(?P<a>.+?)\s*(?:—|–|-|to|по)\s*(?P<b>.+?)$", re.I)

@dataclass
class ExpResult:
    years: Optional[float]
    confidence: float
    debug: Dict

def _clamp_years(y: float) -> Optional[float]:
    if 0.0 <= y <= 45.0:
        return y
    return None

def _parse_mon(mon: str) -> Optional[int]:
    m = mon.strip().lower()
    m = re.sub(r"[^\wа-я]+", "", m, flags=re.I)
    # allow prefixes: "январ", "феврал"
    for k, v in MONTHS.items():
        if m.startswith(k):
            return v
    return None

def _as_ymd(y: int, m: int) -> date:
    return date(y, m, 1)

def _parse_one_date(s: str) -> Optional[date]:
    s = s.strip()
    if PRESENT_RE.search(s):
        today = date.today()
        return date(today.year, today.month, 1)

    m1 = MMYYYY_RE.search(s)
    if m1:
        mm = int(m1.group(1))
        yy = int(m1.group(2))
        return _as_ymd(yy, mm)

    m2 = MON_YYYY_RE.search(s)
    if m2:
        mon = _parse_mon(m2.group(1))
        yy = int(m2.group(2))
        if mon:
            return _as_ymd(yy, mon)

    m3 = YYYY_RE.search(s)
    if m3:
        yy = int(m3.group(1))
        return _as_ymd(yy, 1)

    return None

def _merge_intervals(intervals: List[Tuple[date, date]]) -> List[Tuple[date, date]]:
    if not intervals:
        return []
    intervals = sorted(intervals, key=lambda x: (x[0], x[1]))
    merged = [intervals[0]]
    for s, e in intervals[1:]:
        ls, le = merged[-1]
        if s <= le:
            merged[-1] = (ls, max(le, e))
        else:
            merged.append((s, e))
    return merged

def _months_between(a: date, b: date) -> int:
    # month-level difference (inclusive-ish): b >= a
    return (b.year - a.year) * 12 + (b.month - a.month)

def extract_experience(text: str) -> ExpResult:
    debug: Dict = {"direct_matches": [], "ranges": [], "intervals": []}

    # 1) Direct years
    directs = []
    for m in DIRECT_YEARS_RE.finditer(text):
        try:
            v = float(m.group(1).replace(",", "."))
            if 0 <= v <= 45:
                directs.append(v)
                debug["direct_matches"].append({"match": m.group(0), "value": v})
        except Exception:
            pass
    if directs:
        years = _clamp_years(max(directs))
        return ExpResult(years=years, confidence=0.90, debug=debug)

    # 2) Ranges in lines: try to detect "start - end"
    intervals: List[Tuple[date, date]] = []
    for line in text.splitlines():
        ln = line.strip()
        if len(ln) < 7:
            continue
        # require range separator
        if not any(x in ln for x in ("—", "–", "-", " to ", " по ")):
            continue
        rr = RANGE_RE.match(ln)
        if not rr:
            continue
        a = rr.group("a")
        b = rr.group("b")
        da = _parse_one_date(a)
        db = _parse_one_date(b)
        if da and db:
            if db < da:
                da, db = db, da
            # cap extremely old
            if da.year < 1990:
                continue
            intervals.append((da, db))
            debug["ranges"].append({"line": ln, "start": da.isoformat(), "end": db.isoformat()})

    intervals = _merge_intervals(intervals)
    debug["intervals"] = [{"start": s.isoformat(), "end": e.isoformat()} for s, e in intervals]

    if not intervals:
        return ExpResult(years=None, confidence=0.0, debug=debug)

    total_months = 0
    for s, e in intervals:
        total_months += max(0, _months_between(s, e))
    years = round(total_months / 12.0, 2)
    years = _clamp_years(years) if years is not None else None

    # confidence depends on amount of evidence
    conf = 0.70 if total_months >= 12 else 0.55
    return ExpResult(years=years, confidence=conf, debug=debug)
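A sketch of both extraction paths on made-up text: an explicit "X years" claim wins with high confidence; otherwise merged date ranges are converted from months to years:

from tg_resume_db.extract.experience import extract_experience

direct = extract_experience("Backend engineer with 7+ years of experience")
print(direct.years, direct.confidence)   # 7.0 0.9

ranged = extract_experience("01.2018 - 12.2020\nPython developer, Acme")
print(ranged.years, ranged.confidence)   # 35 months -> 2.92 0.7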
144  extract/experience_timeline.py  Normal file
@@ -0,0 +1,144 @@
from __future__ import annotations

import re
from dataclasses import dataclass, asdict
from datetime import date
from typing import List, Optional

MONTHS = {
    "jan": 1, "january": 1, "янв": 1, "январ": 1,
    "feb": 2, "february": 2, "фев": 2, "феврал": 2,
    "mar": 3, "march": 3, "мар": 3, "март": 3,
    "apr": 4, "april": 4, "апр": 4, "апрел": 4,
    "may": 5, "май": 5,
    "jun": 6, "june": 6, "июн": 6, "июнь": 6,
    "jul": 7, "july": 7, "июл": 7, "июль": 7,
    "aug": 8, "august": 8, "авг": 8, "август": 8,
    "sep": 9, "september": 9, "сен": 9, "сент": 9,
    "oct": 10, "october": 10, "окт": 10, "октя": 10,
    "nov": 11, "november": 11, "ноя": 11, "ноябр": 11,
    "dec": 12, "december": 12, "дек": 12, "дека": 12,
}

PRESENT_RE = re.compile(r"\b(present|now|current|настоящее время|по н\.в\.|по настоящее)\b", re.I)
MMYYYY_RE = re.compile(r"\b(0?[1-9]|1[0-2])[./-](\d{4})\b")
YYYY_RE = re.compile(r"\b(19\d{2}|20\d{2})\b")
MON_YYYY_RE = re.compile(r"\b([A-Za-z]{3,9}|[А-Яа-я]{3,9})\.?\s*(\d{4})\b")
RANGE_RE = re.compile(r"(?P<a>.+?)\s*(?:—|–|-|to|по)\s*(?P<b>.+?)$", re.I)
YEAR_RANGE_ONLY_RE = re.compile(r"^\s*\d{4}\s*(?:—|–|-|to|по)\s*\d{4}\s*$", re.I)
EDU_CONTEXT_RE = re.compile(
    r"\b("
    r"education|university|institute|college|academy|school|bachelor|master|degree|faculty|"
    r"образование|университет|институт|академ|колледж|школа|бакалав|магистр|факультет"
    r")\b",
    re.I,
)


@dataclass
class Position:
    title: Optional[str]
    company: Optional[str]
    date_from: Optional[str]
    date_to: Optional[str]
    is_current: Optional[bool]
    description: Optional[str]


def _parse_mon(mon: str) -> Optional[int]:
    m = mon.strip().lower()
    m = re.sub(r"[^\wа-я]+", "", m, flags=re.I)
    for k, v in MONTHS.items():
        if m.startswith(k):
            return v
    return None


def _as_ymd(y: int, m: int) -> date:
    return date(y, m, 1)


def _parse_one_date(s: str) -> Optional[date]:
    s = s.strip()
    if PRESENT_RE.search(s):
        today = date.today()
        return date(today.year, today.month, 1)
    m1 = MMYYYY_RE.search(s)
    if m1:
        mm = int(m1.group(1))
        yy = int(m1.group(2))
        return _as_ymd(yy, mm)
    m2 = MON_YYYY_RE.search(s)
    if m2:
        mon = _parse_mon(m2.group(1))
        yy = int(m2.group(2))
        if mon:
            return _as_ymd(yy, mon)
    m3 = YYYY_RE.search(s)
    if m3:
        yy = int(m3.group(1))
        return _as_ymd(yy, 1)
    return None


def extract_positions(text: str, max_items: int = 40) -> List[Position]:
    lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]
    positions: List[Position] = []
    i = 0
    while i < len(lines) and len(positions) < max_items:
        ln = lines[i]
        if not any(x in ln for x in ("—", "–", "-", " to ", " по ")):
            i += 1
            continue
        rr = RANGE_RE.match(ln)
        if not rr:
            i += 1
            continue
        ctx = " ".join(lines[max(0, i - 2): min(len(lines), i + 4)])
        if YEAR_RANGE_ONLY_RE.match(ln) and EDU_CONTEXT_RE.search(ctx):
            i += 1
            continue
        da = _parse_one_date(rr.group("a"))
        db = _parse_one_date(rr.group("b"))
        if not da or not db:
            i += 1
            continue
        if da.year < 1990:
            i += 1
            continue
        is_current = PRESENT_RE.search(rr.group("b")) is not None
        title = None
        company = None
        desc_lines: List[str] = []
        if i + 1 < len(lines):
            if EDU_CONTEXT_RE.search(lines[i + 1]):
                i += 1
                continue
            header = lines[i + 1]
            parts = [p.strip() for p in re.split(r"[,|/]", header) if p.strip()]
            if parts:
                title = parts[0]
                if len(parts) > 1:
                    company = parts[1]
        j = i + 2
        while j < len(lines):
            if any(x in lines[j] for x in ("—", "–", "-", " to ", " по ")) and RANGE_RE.match(lines[j]):
                break
            desc_lines.append(lines[j])
            j += 1
        positions.append(
            Position(
                title=title,
                company=company,
                date_from=da.isoformat(),
                date_to=db.isoformat(),
                is_current=is_current,
                description="\n".join(desc_lines).strip() if desc_lines else None,
            )
        )
        i = j
    return positions


def positions_to_dicts(items: List[Position]) -> List[dict]:
    return [asdict(p) for p in items]
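A sketch of the timeline extractor on a made-up snippet: each date-range line is paired with the header line after it (title, company), and subsequent lines become the description until the next range:

from tg_resume_db.extract.experience_timeline import extract_positions, positions_to_dicts

text = (
    "03.2021 - настоящее время\n"
    "Senior Backend Engineer, Acme Corp\n"
    "Built billing services in Python.\n"
    "01.2018 - 02.2021\n"
    "Python Developer, Initech\n"
    "Internal tooling and integrations.\n"
)
for p in positions_to_dicts(extract_positions(text)):
    print(p["date_from"], p["date_to"], p["is_current"], p["title"], "@", p["company"])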
585
extract/llm.py
Normal file
585
extract/llm.py
Normal file
@@ -0,0 +1,585 @@
|
|||||||
|
from __future__ import annotations

import hashlib
import json
import os
import re
import sqlite3
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

try:
    import httpx  # type: ignore
except Exception:  # pragma: no cover
    httpx = None  # type: ignore


def resolve_llm_runtime() -> Dict[str, str]:
    """
    Resolve OpenAI-compatible runtime config.
    Supports both generic vars and Mistral aliases:
    - generic: LLM_BASE_URL / LLM_MODEL / LLM_API_KEY
    - mistral: MISTRAL_API_KEY / MISTRAL_MODEL / MISTRAL_BASE_URL
    """
    provider = (os.environ.get("LLM_PROVIDER") or "").strip().lower()
    base_url = (os.environ.get("LLM_BASE_URL") or "").strip()
    model = (os.environ.get("LLM_MODEL") or "").strip()
    api_key = (os.environ.get("LLM_API_KEY") or "").strip()

    mistral_key = (os.environ.get("MISTRAL_API_KEY") or "").strip()
    mistral_model = (os.environ.get("MISTRAL_MODEL") or "").strip()
    mistral_base = (os.environ.get("MISTRAL_BASE_URL") or "https://api.mistral.ai/v1").strip()

    if not api_key and mistral_key:
        api_key = mistral_key
    if not model and mistral_model:
        model = mistral_model
    if not base_url and (mistral_key or mistral_model or provider == "mistral" or os.environ.get("MISTRAL_BASE_URL")):
        base_url = mistral_base

    if base_url:
        base_url = base_url.rstrip("/")

    if not provider:
        if "mistral.ai" in base_url or (model and model.lower().startswith("mistral")):
            provider = "mistral"
        else:
            provider = "generic"

    return {
        "provider": provider,
        "base_url": base_url,
        "model": model,
        "api_key": api_key,
    }

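# Illustrative resolution (not part of the original file): with only
# MISTRAL_API_KEY set in the environment, the Mistral aliases win:
#
#   resolve_llm_runtime()
#   # -> {"provider": "mistral", "base_url": "https://api.mistral.ai/v1",
#   #     "model": "", "api_key": "<the key>"}
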
# ------------- Public API -------------

def llm_parse_enabled() -> bool:
    """
    Enabled only if httpx is available and both base_url/model are resolved.
    Opt-out via LLM_PARSE_ENABLED=0.
    """
    if httpx is None:
        return False
    if os.environ.get("LLM_PARSE_ENABLED", "1").lower() in ("0", "false", "no"):
        return False
    runtime = resolve_llm_runtime()
    return bool(runtime["base_url"]) and bool(runtime["model"])


_PROMPT_VERSION = "v3_sections_doc_type"
_REVIEW_PROMPT_VERSION = "v1_review_merge"


@dataclass
class LLMExtraction:
    roles: List[str]
    skills: List[str]
    primary_languages: List[str]
    seniority: Optional[str]
    backend_focus: Optional[bool]
    experience_years_total: Optional[float]
    experience_years_engineering: Optional[float]
    english_level: Optional[str]
    location: Optional[str]
    remote_ok: Optional[bool]
    salary_min_usd: Optional[int]
    salary_max_usd: Optional[int]
    salary_min_rub: Optional[int]
    salary_max_rub: Optional[int]
    highlights: List[str]
    keywords: List[str]

    @staticmethod
    def from_obj(obj: Dict[str, Any]) -> "LLMExtraction":
        def _as_list(v: Any) -> List[str]:
            if v is None:
                return []
            if isinstance(v, list):
                return [str(x).strip() for x in v if str(x).strip()]
            s = str(v).strip()
            return [s] if s else []

        def _as_float(v: Any) -> Optional[float]:
            try:
                return float(v)
            except Exception:
                return None

        def _as_int(v: Any) -> Optional[int]:
            try:
                return int(float(v))
            except Exception:
                return None

        def _as_bool(v: Any) -> Optional[bool]:
            if isinstance(v, bool):
                return v
            if v is None:
                return None
            s = str(v).strip().lower()
            if s in ("true", "1", "yes", "y"):
                return True
            if s in ("false", "0", "no", "n"):
                return False
            return None

        return LLMExtraction(
            roles=_as_list(obj.get("roles")),
            skills=_as_list(obj.get("skills")),
            primary_languages=_as_list(obj.get("primary_languages")),
            seniority=(str(obj.get("seniority")).strip().lower() or None) if obj.get("seniority") else None,
            backend_focus=_as_bool(obj.get("backend_focus")),
            experience_years_total=_as_float(obj.get("experience_years_total")),
            experience_years_engineering=_as_float(obj.get("experience_years_engineering")),
            english_level=(str(obj.get("english_level")).strip().upper() or None) if obj.get("english_level") else None,
            location=(str(obj.get("location")).strip() or None) if obj.get("location") else None,
            remote_ok=_as_bool(obj.get("remote_ok")),
            salary_min_usd=_as_int(obj.get("salary_min_usd")),
            salary_max_usd=_as_int(obj.get("salary_max_usd")),
            salary_min_rub=_as_int(obj.get("salary_min_rub")),
            salary_max_rub=_as_int(obj.get("salary_max_rub")),
            highlights=_as_list(obj.get("highlights")),
            keywords=_as_list(obj.get("keywords")),
        )

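# Illustrative coercion behavior (hypothetical input):
#
#   LLMExtraction.from_obj({"skills": "python", "remote_ok": "yes",
#                           "salary_min_usd": "90000.0"})
#   # -> skills=["python"], remote_ok=True, salary_min_usd=90000;
#   #    keys outside the schema are ignored.
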
def llm_extract_profile(
    clean_text: str,
    *,
    con: Optional[sqlite3.Connection] = None,
    doc_type: Optional[str] = None,
    sections: Optional[Dict[str, str]] = None,
) -> Tuple[Optional[LLMExtraction], Dict[str, Any]]:
    """
    Returns (LLMExtraction | None, debug_info).
    - Uses cache on disk/sqlite to keep throughput high.
    - Silently degrades to None on any failure.
    """
    runtime = resolve_llm_runtime()
    dbg: Dict[str, Any] = {
        "enabled": llm_parse_enabled(),
        "provider": runtime.get("provider"),
        "model": runtime.get("model"),
        "from_cache": False,
        "cache_backend": None,
        "error": None,
        "prompt_version": _PROMPT_VERSION,
    }
    if not llm_parse_enabled():
        return None, dbg

    text_hash = hashlib.sha1(clean_text.encode("utf-8", errors="ignore")).hexdigest()
    cache_key = f"extract:{text_hash}:{runtime['model']}:{_PROMPT_VERSION}"

    payload = _build_payload(
        clean_text,
        doc_type=doc_type,
        sections=sections,
        prompt_version=_PROMPT_VERSION,
        temperature=float(os.environ.get("LLM_PARSE_TEMPERATURE", 0.1)),
        max_tokens=int(os.environ.get("LLM_PARSE_MAX_TOKENS", 700)),
        system_prompt="You output ONLY JSON for structured resume extraction.",
        prompt_template=_PROMPT_TEMPLATE,
    )

    data = _cached_llm_json_call(
        con=con,
        cache_key=cache_key,
        model=runtime["model"],
        payload=payload,
        dbg=dbg,
    )
    if data is None:
        return None, dbg
    return LLMExtraction.from_obj(data), dbg

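# Usage sketch (assumes the LLM env vars are configured; see
# resolve_llm_runtime above):
#
#   extraction, dbg = llm_extract_profile(clean_text, con=con,
#                                         doc_type="resume", sections=sections)
#   if extraction is not None:
#       print(extraction.roles, dbg["from_cache"])
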
def llm_review_profile(
    clean_text: str,
    *,
    draft: Dict[str, Any],
    con: Optional[sqlite3.Connection] = None,
    doc_type: Optional[str] = None,
    sections: Optional[Dict[str, str]] = None,
) -> Tuple[Optional[LLMExtraction], Dict[str, Any]]:
    """
    Second-pass validator:
    - Takes the already-parsed JSON (draft)
    - Re-checks every field against the resume text
    - Returns a corrected extraction for safe merging in the pipeline
    """
    runtime = resolve_llm_runtime()
    dbg: Dict[str, Any] = {
        "enabled": llm_parse_enabled(),
        "provider": runtime.get("provider"),
        "model": runtime.get("model"),
        "from_cache": False,
        "cache_backend": None,
        "error": None,
        "prompt_version": _REVIEW_PROMPT_VERSION,
        "quality_score": None,
        "changed_fields": [],
        "issues_found": [],
    }
    if not llm_parse_enabled():
        return None, dbg

    clean_draft = _sanitize_review_draft(draft)
    draft_blob = json.dumps(clean_draft, ensure_ascii=False, sort_keys=True)
    text_hash = hashlib.sha1(clean_text.encode("utf-8", errors="ignore")).hexdigest()
    draft_hash = hashlib.sha1(draft_blob.encode("utf-8", errors="ignore")).hexdigest()
    cache_key = f"review:{text_hash}:{draft_hash}:{runtime['model']}:{_REVIEW_PROMPT_VERSION}"

    payload = _build_payload(
        clean_text,
        doc_type=doc_type,
        sections=sections,
        prompt_version=_REVIEW_PROMPT_VERSION,
        temperature=float(os.environ.get("LLM_REVIEW_TEMPERATURE", 0.0)),
        max_tokens=int(os.environ.get("LLM_REVIEW_MAX_TOKENS", 850)),
        system_prompt="You output ONLY JSON for resume parsing quality review.",
        prompt_template=_REVIEW_PROMPT_TEMPLATE,
        extra_vars={"draft_json": draft_blob},
    )

    data = _cached_llm_json_call(
        con=con,
        cache_key=cache_key,
        model=runtime["model"],
        payload=payload,
        dbg=dbg,
    )
    if data is None:
        return None, dbg

    corrected_obj: Dict[str, Any]
    if isinstance(data.get("corrected"), dict):
        corrected_obj = data["corrected"]
    else:
        corrected_obj = data

    dbg["quality_score"] = _as_float(data.get("quality_score"))
    dbg["changed_fields"] = _as_str_list(data.get("changed_fields"))
    dbg["issues_found"] = _as_str_list(data.get("issues_found"))

    return LLMExtraction.from_obj(corrected_obj), dbg

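# Usage sketch (hypothetical draft produced by the heuristic pass; the 0.5
# acceptance threshold below is made up for illustration):
#
#   reviewed, dbg = llm_review_profile(clean_text, draft=draft_dict, con=con)
#   if reviewed is not None and (dbg.get("quality_score") or 0) >= 0.5:
#       ...  # merge `reviewed` over the draft fields
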

# ------------- Internal helpers -------------

_PROMPT_TEMPLATE = """
You are an assistant that structures developer resumes. Answer ONLY with JSON.
Use only facts from the text; do not invent anything. If data is missing, use null or an empty list.
Schema:
{{
  "roles": ["backend","devops","frontend","qa","data engineer","android","ios"],
  "skills": ["python","go","k8s","postgres","react", "..."],
  "primary_languages": ["python","go","java","c++", "..."],
  "seniority": "intern|junior|middle|senior|lead|principal|null",
  "backend_focus": true|false|null,
  "experience_years_total": number|null,
  "experience_years_engineering": number|null,
  "english_level": "A1|A2|B1|B2|C1|C2|null",
  "location": "city, country|null",
  "remote_ok": true|false|null,
  "salary_min_usd": int|null,
  "salary_max_usd": int|null,
  "salary_min_rub": int|null,
  "salary_max_rub": int|null,
  "highlights": ["brief achievements (1-2 sentences)"],
  "keywords": ["unique keywords, products, or domains"]
}}
Do not include contact details in skills/keywords.
Detected doc_type: {doc_type}
Sections (if present):
{sections_block}

Full text snippet (use only if needed):
```TEXT
{resume_text}
```
"""

_REVIEW_PROMPT_TEMPLATE = """
You are a quality validator for developer-resume parsing. Answer ONLY with JSON.
You are given a draft JSON produced by heuristics / first-pass parsing. Re-check every field against the resume text.
Correct only what is directly confirmed by the text. Do not invent anything.

Return JSON of exactly this shape:
{{
  "corrected": {{
    "roles": ["..."],
    "skills": ["..."],
    "primary_languages": ["..."],
    "seniority": "intern|junior|middle|senior|lead|principal|null",
    "backend_focus": true|false|null,
    "experience_years_total": number|null,
    "experience_years_engineering": number|null,
    "english_level": "A1|A2|B1|B2|C1|C2|null",
    "location": "city, country|null",
    "remote_ok": true|false|null,
    "salary_min_usd": int|null,
    "salary_max_usd": int|null,
    "salary_min_rub": int|null,
    "salary_max_rub": int|null,
    "highlights": ["..."],
    "keywords": ["..."]
  }},
  "changed_fields": ["field_name", "..."],
  "issues_found": ["briefly, what was wrong or doubtful", "..."],
  "quality_score": 0.0
}}

Draft JSON:
```DRAFT
{draft_json}
```

Detected doc_type: {doc_type}
Sections (if present):
{sections_block}

Full text snippet (use only if needed):
```TEXT
{resume_text}
```
"""

def _trim_text(text: str, max_len: int = 9000) -> str:
    """
    Keep head and tail to preserve summary + recent projects.
    """
    if len(text) <= max_len:
        return text
    head = text[: max_len // 2]
    tail = text[-max_len // 2 :]
    return head + "\n...\n" + tail


def _build_payload(
    clean_text: str,
    *,
    doc_type: Optional[str],
    sections: Optional[Dict[str, str]],
    prompt_version: str,
    temperature: float,
    max_tokens: int,
    system_prompt: str,
    prompt_template: str,
    extra_vars: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    runtime = resolve_llm_runtime()
    base_url = runtime["base_url"]
    model = runtime["model"]

    sections_block = _build_sections_block(sections)
    tpl_vars = {
        "resume_text": _trim_text(clean_text),
        "doc_type": (doc_type or "unknown"),
        "sections_block": sections_block or "(no sections detected)",
    }
    if extra_vars:
        tpl_vars.update(extra_vars)

    prompt = prompt_template.format(**tpl_vars)

    return {
        "base_url": base_url,
        "model": model,
        "prompt_version": prompt_version,
        "payload": {
            "model": model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
            ],
            "temperature": temperature,
            "max_tokens": max_tokens,
        },
        "headers": _build_headers(runtime),
        "timeout": float(os.environ.get("LLM_PARSE_TIMEOUT", 18.0)),
    }


def _build_headers(runtime: Dict[str, str]) -> Dict[str, str]:
    headers = {"Content-Type": "application/json"}
    api_key = runtime.get("api_key", "")
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"
    return headers


def _cached_llm_json_call(
    *,
    con: Optional[sqlite3.Connection],
    cache_key: str,
    model: str,
    payload: Dict[str, Any],
    dbg: Dict[str, Any],
) -> Optional[Dict[str, Any]]:
    data = _cache_get_sqlite(con, cache_key)
    if data:
        dbg["from_cache"] = True
        dbg["cache_backend"] = "sqlite"
        return data

    cache_dir = Path(os.environ.get("LLM_PARSE_CACHE", ".cache/llm_parse")).resolve()
    cache_ok = True
    try:
        cache_dir.mkdir(parents=True, exist_ok=True)
    except Exception:
        cache_ok = False

    safe_name = cache_key.replace(":", "_")
    cache_path = (cache_dir / f"{safe_name}.json") if cache_ok else None

    if cache_path and cache_path.exists():
        try:
            data = json.loads(cache_path.read_text(encoding="utf-8"))
            dbg["from_cache"] = True
            dbg["cache_backend"] = "disk"
            return data
        except Exception:
            pass

    try:
        data = _llm_call_json(payload)
        if con:
            _cache_put_sqlite(con, cache_key, model, data)
        if cache_path:
            cache_path.write_text(json.dumps(data, ensure_ascii=False), encoding="utf-8")
        return data
    except Exception as e:  # pragma: no cover - network/LLM failures
        dbg["error"] = repr(e)
        return None


def _llm_call_json(task: Dict[str, Any]) -> Dict[str, Any]:
    if httpx is None:
        raise RuntimeError("httpx is not installed")

    base_url: str = task["base_url"]
    payload: Dict[str, Any] = task["payload"]
    timeout = float(task.get("timeout", 18.0))

    with httpx.Client(timeout=timeout) as client:
        r = client.post(f"{base_url}/chat/completions", headers=task["headers"], json=payload)
        r.raise_for_status()
        data = r.json()

    content = data["choices"][0]["message"]["content"]
    if isinstance(content, list):
        parts = []
        for block in content:
            if isinstance(block, dict):
                parts.append(str(block.get("text") or ""))
            else:
                parts.append(str(block))
        content = "\n".join(parts)
    content = str(content)

    m = re.search(r"\{.*\}", content, flags=re.S)
    if not m:
        raise ValueError("LLM did not return JSON")
    return json.loads(m.group(0))

def _build_sections_block(sections: Optional[Dict[str, str]]) -> str:
    if not sections:
        return ""
    parts: List[str] = []
    order = [
        ("about", "ABOUT"),
        ("skills", "SKILLS"),
        ("experience", "EXPERIENCE"),
        ("education", "EDUCATION"),
        ("contacts", "CONTACTS"),
    ]
    for key, label in order:
        text = sections.get(key)
        if not text:
            continue
        snippet = _trim_text(text, max_len=1800)
        parts.append(f"[{label}]\n{snippet}")
    return "\n\n".join(parts)


def _sanitize_review_draft(draft: Dict[str, Any]) -> Dict[str, Any]:
    if not isinstance(draft, dict):
        draft = {}

    allowed = {
        "roles",
        "skills",
        "primary_languages",
        "seniority",
        "backend_focus",
        "experience_years_total",
        "experience_years_engineering",
        "english_level",
        "location",
        "remote_ok",
        "salary_min_usd",
        "salary_max_usd",
        "salary_min_rub",
        "salary_max_rub",
        "highlights",
        "keywords",
    }
    cleaned = {k: v for k, v in draft.items() if k in allowed}
    return asdict(LLMExtraction.from_obj(cleaned))


def _as_float(v: Any) -> Optional[float]:
    # Module-level variant used for quality_score: coerces and clamps to [0, 1].
    try:
        x = float(v)
    except Exception:
        return None
    if x < 0:
        return None
    if x > 1.0:
        return 1.0
    return x


def _as_str_list(v: Any) -> List[str]:
    if v is None:
        return []
    if isinstance(v, list):
        return [str(x).strip() for x in v if str(x).strip()]
    s = str(v).strip()
    return [s] if s else []


def _cache_get_sqlite(con: Optional[sqlite3.Connection], cache_key: str) -> Optional[Dict[str, Any]]:
    if con is None:
        return None
    try:
        row = con.execute("SELECT result_json FROM llm_cache WHERE cache_key=?", (cache_key,)).fetchone()
        if row and row["result_json"]:
            return json.loads(row["result_json"])
    except Exception:
        return None
    return None


def _cache_put_sqlite(
    con: Optional[sqlite3.Connection],
    cache_key: str,
    model: str,
    data: Dict[str, Any],
) -> None:
    if con is None:
        return
    try:
        con.execute(
            "INSERT OR REPLACE INTO llm_cache(cache_key, model, result_json) VALUES (?,?,?)",
            (cache_key, model, json.dumps(data, ensure_ascii=False)),
        )
    except Exception:
        return
659
extract/parse.py
Normal file
@@ -0,0 +1,659 @@
from __future__ import annotations

import json
import re
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
from tg_resume_db.normalize import normalize_skill
from tg_resume_db.extract.experience import extract_experience

EMAIL_RE = re.compile(r"\b[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}\b", re.I)
EMAIL_SPLIT_RE = re.compile(
    r"(?<![@\w])(?P<prefix>[a-z0-9][a-z0-9._%+\-]{1,40})\s+"
    r"(?P<tail>[a-z0-9][a-z0-9._%+\-]{0,40}@[a-z0-9.\-]+\.[a-z]{2,})",
    re.I,
)
PHONE_RE = re.compile(r"(?<!\w)(\+?\d[\d\-\s().]{7,}\d)(?!\w)")
TG_RE = re.compile(r"(?:t\.me/|@)([a-z0-9_]{4,32})", re.I)
GITHUB_RE = re.compile(r"github\.com/([A-Za-z0-9\-]+)", re.I)
LINKEDIN_RE = re.compile(r"linkedin\.com/in/([A-Za-z0-9\-_]+)", re.I)
URL_RE = re.compile(r"\bhttps?://[^\s)]+", re.I)

EN_RE = re.compile(r"\b(A1\+?|A2\+?|B1\+?|B2\+?|C1\+?|C2\+?)\b", re.I)
EN_TEXT_RE = re.compile(
    # "upper[\s\-]*intermediate" also matches the common "upper-intermediate" spelling.
    r"\b(native|fluent|proficient|advanced|upper[\s\-]*intermediate|intermediate|elementary)\b",
    re.I,
)
EN_LANG_RE = re.compile(r"\b(english|англий)\b", re.I)

REMOTE_RE = re.compile(
    r"\b("
    r"full[\s\-]?remote|remote[\s\-]?(work|position|job|only)|open to remote|remote first|"
    r"удален\w*|удалён\w*|дистанцион\w*|home office|relocation not needed"
    r")\b",
    re.I,
)

# Salary (rough)
CURRENCY_RE = re.compile(r"(?:\b(?:руб|rub|usd|eur)\b|[₽$€])", re.I)
NUM_RE = re.compile(r"(?:(\d{2,3})\s*(k|к)\b)|(\d{2,3}\s*000)|(\d{4,7})", re.I)
SALARY_HINT_RE = re.compile(
    r"\b("
    r"salary|compensation|rate|expected salary|desired salary|salary expectation|income|"
    r"зарплат\w*|доход|оклад|вознагражден\w*|заработ\w*|expectations"
    r")\b",
    re.I,
)
PAY_TOKEN_RE = re.compile(
    r"([€$₽]|\b(?:usd|eur|rub|руб)\b).{0,14}\d|\d.{0,14}([€$₽]|\b(?:usd|eur|rub|руб)\b)",
    re.I,
)
SALARY_NOISE_RE = re.compile(
    r"\b(users?|employees?|people|domains?|cities?|objects?|stores?|requests?|transactions?|"
    r"companies?|followers?|downloads?|clients?)\b",
    re.I,
)

SECTION_HEADER_RE = re.compile(
    r"^\s*(contacts?|contact info|about|summary|skills?|experience|work experience|education|languages?|projects?)\s*$",
    re.I,
)
LOCATION_CITY_COUNTRY_RE = re.compile(
    r"^[A-Za-zА-Яа-я][A-Za-zА-Яа-я' .\-]{1,40},\s*[A-Za-zА-Яа-я][A-Za-zА-Яа-я' .\-]{1,40}$"
)

# --- SKILLS & ROLES ---

SKILLS = {
    "python","go","golang","java","kotlin","c#","c++","cpp","javascript","typescript","node","nodejs","react","vue","angular",
    "sql","postgres","postgresql","mysql","mssql","redis","kafka","rabbitmq","docker","k8s","kubernetes","helm","terraform",
    "aws","gcp","azure","linux","nginx","grpc","rest","graphql","spark","airflow","fastapi","django","flask","spring","dotnet",
    "pytest","selenium","playwright","ci/cd","gitlab","github actions","prometheus","grafana",
}

_SKILL_ALIASES: Dict[str, List[str]] = {
    "javascript": ["java script", "java-script", "js"],
    "typescript": ["type script", "type-script", "ts"],
    "postgresql": ["postgres", "postgre sql", "postgre-sql"],
    "graphql": ["graph ql"],
    "grpc": ["g rpc"],
}


def _build_skill_patterns() -> List[Tuple[str, re.Pattern]]:
    patterns: List[Tuple[str, re.Pattern]] = []
    for skill in sorted(SKILLS):
        aliases = [skill] + _SKILL_ALIASES.get(skill, [])
        for alias in aliases:
            if skill == "java" and alias == "java":
                # Do not match "java" inside "java script".
                pat = re.compile(r"(?<![a-z0-9+#])java(?!\s*script)(?![a-z0-9+#])", re.I)
            else:
                pat = re.compile(r"(?<![a-z0-9+#])" + re.escape(alias) + r"(?![a-z0-9+#])", re.I)
            patterns.append((skill, pat))
    return patterns


_SKILL_PATTERNS = _build_skill_patterns()

ROLES = {
    "backend","frontend","fullstack","devops","qa","sre","data engineer","data scientist","ml engineer",
    "mobile","android","ios","team lead","tech lead","architect",
}

_ROLE_ALIASES: Dict[str, List[str]] = {
    "backend": ["backend", "backend developer", "backend engineer", "бэкенд", "бекенд"],
    "frontend": ["frontend", "frontend developer", "frontend engineer", "фронтенд", "фронт"],
    "fullstack": ["fullstack", "full stack", "full-stack", "фулстек"],
    "devops": ["devops", "dev ops", "platform engineer", "infrastructure engineer"],
    "qa": ["qa", "quality assurance", "tester", "test engineer", "test automation", "manual qa"],
    "sre": ["sre", "site reliability"],
    "data engineer": ["data engineer"],
    "data scientist": ["data scientist"],
    "ml engineer": ["ml engineer", "machine learning engineer"],
    "mobile": ["mobile developer", "mobile engineer"],
    "android": ["android developer", "android engineer"],
    "ios": ["ios developer", "ios engineer"],
    "team lead": ["team lead", "teamlead"],
    "tech lead": ["tech lead", "techlead"],
    "architect": ["architect", "solution architect", "software architect"],
}


def _build_role_patterns() -> Dict[str, List[re.Pattern]]:
    out: Dict[str, List[re.Pattern]] = {}
    for role in ROLES:
        aliases = _ROLE_ALIASES.get(role, [role])
        out[role] = [
            re.compile(r"(?<![a-z0-9+#])" + re.escape(a) + r"(?![a-z0-9+#])", re.I)
            for a in aliases
        ]
    return out


_ROLE_PATTERNS = _build_role_patterns()

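# Boundary behavior worth noting (illustrative): the special-cased "java"
# pattern matches "Java" or "java/kotlin" but not "JavaScript" / "java script",
# while the "js" and "java script" aliases still resolve to "javascript".
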
# --- HR / RECRUITER FILTERS ---
# Words that indicate the line is about searching for candidates, not owning the skill.
HR_CONTEXT_RE = re.compile(
    r"\b(hiring|recruitment|recruiter|sourc(ing|er)|talent|acquisition|vacancy|vacancies|candidate|staffing|headhunt)\b|"
    r"\b(подбор|поиск|найм|закры(ла|л|тие)|ваканси|резюме|сорс(инг|ер)|рекрут|персонал|кадр(ы|ов)|hr)\b",
    re.I,
)

# Roles that explicitly define the person as non-engineering.
NON_TECH_ROLES_RE = re.compile(
    r"\b(recruiter|hr|talent|manager|generalist|human resources|head of recruitment|рекрутер|менеджер по персоналу|эйчар)\b",
    re.I,
)

# --- EXPERIENCE ---

AGE_LINE_RE = re.compile(
    r"(?i)\b(мужчина|женщина|родил[а-я]*|возраст|years?\s+old)\b"
)

EXP_HEADER_RE = re.compile(
    r"(?i)\b(опыт\s+работы|стаж(\s+работы)?|work\s+experience|experience)\b"
)

# "5 years 10 months"
EXP_SUMMARY_RE = re.compile(
    r"(?i)\b(опыт\s+работы|стаж(\s+работы)?|work\s+experience|experience)\b"
    r"[^0-9]{0,20}"
    r"(?P<y>\d{1,2})\s*(?:год|года|лет|years?|yrs?)"
    r"(?:[^0-9]{0,20}(?P<m>\d{1,2})\s*(?:мес|месяц|месяца|месяцев|months?))?"
)

EXP_NEARBY_RE = re.compile(
    r"(?i)\b(?P<y>\d{1,2})\s*(?:год|года|лет|years?|yrs?)"
    r"(?:[^0-9]{0,20}(?P<m>\d{1,2})\s*(?:мес|месяц|месяца|месяцев|months?))?"
)

HH_FOOTER_RE = re.compile(
    r"(?P<name>[A-Za-zА-ЯЁ][A-Za-zА-Яа-яЁё'\-\s]{2,80})\s*[•|]\s*резюме\s+обновлено",
    re.I,
)
NAME_KV_RE = re.compile(r"^\s*(name|имя)\s*[:\-]\s*(.+)$", re.I)
NAME_LINE_RE = re.compile(
    r"^[A-ZА-ЯЁ][A-Za-zА-Яа-яЁё'\-]+(?:\s+[A-ZА-ЯЁ][A-Za-zА-Яа-яЁё'\-]+){1,3}$"
)
NAME_STOPWORDS = {
    "resume", "cv", "contacts", "contact", "summary", "skills", "experience", "education",
    "projects", "about", "profile", "objective", "навыки", "опыт", "образование",
    "контакты", "профиль", "цель", "резюме",
    "developer", "engineer", "backend", "frontend", "fullstack", "team lead", "tech lead",
    "backend developer", "frontend developer", "fullstack developer", "software engineer",
    "разработчик", "инженер", "бэкенд", "фронтенд", "фулстек", "тимлид", "техлид",
    "top skills", "languages", "certifications",
    "skills & endorsements", "endorsements",
    "university", "state university", "institute", "college", "academy", "school",
    "bachelor", "master", "degree", "faculty", "университет", "институт", "академия",
    "колледж", "школа", "бакалавр", "магистр", "факультет",
}

_NAME_BAD_WORDS = {
    "skills", "top skills", "experience", "education", "languages", "certifications",
    "projects", "summary", "about", "profile", "endorsements",
    "university", "institute", "college", "academy", "school",
    "bachelor", "master", "degree", "faculty",
}

NAME_INSTITUTION_RE = re.compile(
    r"\b("
    r"university|institute|college|academy|school|faculty|bachelor|master|degree|"
    r"mathematics|computer science|informatics|physics|economics|management|"
    r"университет|институт|академ|колледж|школа|факультет|бакалав|магистр|"
    r"математик|информатик|физик|экономик|менеджмент"
    r")\b",
    re.I,
)

_EMAIL_PREFIX_STOP = {
    "email", "mail", "contact", "contacts", "phone", "tel", "telegram", "linkedin", "github",
}


def _prune_fragment_emails(values: List[str]) -> List[str]:
    uniq = sorted(set(v.lower().strip() for v in values if v and "@" in v))
    out: List[str] = []
    for e in uniq:
        local, domain = e.split("@", 1)
        drop = False
        for other in uniq:
            if other == e:
                continue
            ol, od = other.split("@", 1)
            if od != domain:
                continue
            if len(local) <= 8 and len(ol) > len(local) + 2 and ol.endswith(local) and re.search(r"[._\-]", ol):
                drop = True
                break
        if not drop:
            out.append(e)
    return out

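# Illustrative: _prune_fragment_emails(["ivanov@mail.com", "petr.ivanov@mail.com"])
# -> ["petr.ivanov@mail.com"]  (the short prefix fragment on the same domain is
# treated as an artifact of a split address and dropped).
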
def extract_experience_years(text: str) -> Tuple[Optional[float], Optional[float], float, Dict[str, Any]]:
    """
    Returns (total_years, engineering_years, confidence, debug).

    Logic:
    1. Calculate TOTAL experience from summaries.
    2. Check if the candidate is primarily a Recruiter/HR.
       - If YES: engineering_years = 0.0 (prevents recruiters from showing up as senior devs).
       - If NO: engineering_years = total_years (optimistic assumption for valid devs).
    """
    dbg: Dict[str, Any] = {"method": None, "matched": None, "is_recruiter": False}

    total_years: Optional[float] = None
    confidence = 0.0

    lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]

    # 1. Detect if recruiter.
    # Check the "header" (first ~15 lines) for HR titles.
    header_text = "\n".join(lines[:15])
    is_recruiter = bool(NON_TECH_ROLES_RE.search(header_text))
    dbg["is_recruiter"] = is_recruiter

    # 2. Extract total duration.
    if lines:
        # Strategy A: explicit summary line.
        for i, ln in enumerate(lines[:200]):
            if AGE_LINE_RE.search(ln):
                continue

            # Look for a summary line.
            if EXP_HEADER_RE.search(ln):
                window = ln
                if i + 1 < len(lines):
                    window += " " + lines[i + 1]
                if i + 2 < len(lines):
                    window += " " + lines[i + 2]

                m = EXP_SUMMARY_RE.search(window)
                if m:
                    y = int(m.group("y"))
                    mm = int(m.group("m")) if m.group("m") else 0
                    val = float(round(y + (mm / 12.0), 2))
                    # Only accept plausible values; otherwise keep scanning.
                    if 0 <= val <= 60:
                        total_years = val
                        dbg["method"] = "summary"
                        dbg["matched"] = m.group(0)
                        confidence = 0.95
                        break

        # Strategy B: fallback, numbers near an experience header.
        if total_years is None:
            safe_lines = [ln for ln in lines if not AGE_LINE_RE.search(ln)]
            for i, ln in enumerate(safe_lines):
                if not EXP_HEADER_RE.search(ln):
                    continue
                chunk = " ".join(safe_lines[i : i + 12])
                m = EXP_NEARBY_RE.search(chunk)
                if m:
                    y = int(m.group("y"))
                    mm = int(m.group("m")) if m.group("m") else 0
                    val = float(round(y + (mm / 12.0), 2))
                    if 0 <= val <= 60:
                        total_years = val
                        dbg["method"] = "header_chunk"
                        dbg["matched"] = m.group(0)
                        confidence = 0.80
                        break

    # 2.5 Timeline/range fallback-reconciliation.
    # Protects against cases where the summary parser catches one short fragment
    # while the CV has a long timeline.
    try:
        alt = extract_experience(text or "")
    except Exception:
        alt = None
    if alt and alt.years is not None:
        if total_years is None:
            total_years = alt.years
            confidence = max(confidence, alt.confidence)
            dbg["method"] = "timeline_fallback"
            dbg["matched"] = "date_ranges"
        elif alt.years > (total_years + 1.0):
            strong_summary = str(dbg.get("method") or "") in ("summary", "header_chunk") and confidence >= 0.78
            if strong_summary and (alt.years - float(total_years)) > 1.5:
                dbg["reconcile"] = "timeline_skip_strong_summary"
            else:
                total_years = alt.years
                confidence = max(confidence, min(0.82, alt.confidence))
                dbg["method"] = "timeline_reconcile"
                dbg["matched"] = "date_ranges"

    # 3. Calculate engineering years.
    eng_years = total_years
    if is_recruiter:
        # If they are a recruiter, their "engineering" experience is effectively 0
        # for the purpose of finding a developer.
        eng_years = 0.0

    return total_years, eng_years, confidence, dbg

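# Sketch (hypothetical input; the timeline pass in extract_experience may
# adjust the result when date ranges are present):
#
#   total, eng, conf, dbg = extract_experience_years(
#       "Опыт работы 5 лет 10 месяцев\nBackend Developer ..."
#   )
#   # -> total == 5.83, eng == 5.83, conf == 0.95, dbg["method"] == "summary"
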
def _norm_phone(p: str) -> str:
    digits = re.sub(r"\D+", "", p)
    if digits.startswith("8") and len(digits) == 11:
        digits = "7" + digits[1:]
    return "+" + digits if digits else ""


def _norm_token(s: str) -> str:
    return re.sub(r"\s+", " ", s.strip().lower())


def safe_json(v) -> str:
    return json.dumps(v, ensure_ascii=False)


def extract_contacts(text: str) -> Dict[str, List[str]]:
    emails_set = set(m.group(0).lower() for m in EMAIL_RE.finditer(text or ""))
    for m in EMAIL_SPLIT_RE.finditer(text or ""):
        prefix = m.group("prefix").strip().lower().strip(".-_")
        if not prefix or prefix in _EMAIL_PREFIX_STOP:
            continue
        if not re.search(r"[._\-\d]", prefix):
            continue
        tail = m.group("tail").lower()
        if "@" not in tail:
            continue
        local_tail, domain = tail.split("@", 1)
        local = f"{prefix}{local_tail}"
        if len(local) > 64:
            continue
        cand = f"{local}@{domain}"
        if EMAIL_RE.fullmatch(cand):
            emails_set.add(cand)
    emails = _prune_fragment_emails(sorted(emails_set))
    phones = sorted(set(_norm_phone(m.group(1)) for m in PHONE_RE.finditer(text) if _norm_phone(m.group(1))))
    tg = sorted(set(m.group(1).lower() for m in TG_RE.finditer(text)))
    gh = sorted(set(m.group(1).lower() for m in GITHUB_RE.finditer(text)))
    li = sorted(set(m.group(1).lower() for m in LINKEDIN_RE.finditer(text)))
    urls = sorted(set(m.group(0) for m in URL_RE.finditer(text)))
    return {"emails": emails, "phones": phones, "telegram": tg, "github": gh, "linkedin": li, "urls": urls}

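# Illustrative: extract_contacts("Contacts: +7 915 123-45-67 | t.me/janedoe")
# -> {"emails": [], "phones": ["+79151234567"], "telegram": ["janedoe"],
#     "github": [], "linkedin": [], "urls": []}
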
def extract_name_guess(text: str) -> Optional[str]:
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    if not lines:
        return None

    # 1) HH footer "Name • Резюме обновлено ..."
    m = HH_FOOTER_RE.search(text or "")
    if m:
        cand = m.group("name").strip()
        if _looks_like_name_line(cand):
            return cand

    # 2) Key-value line: "Name: ..." / "Имя: ..."
    for ln in lines[:40]:
        m2 = NAME_KV_RE.match(ln)
        if m2:
            cand = m2.group(2).strip()
            cand = re.split(r"[|,/;]", cand)[0].strip()
            if _looks_like_name_line(cand):
                return cand

    # 3) Name-like line in the first ~40 lines.
    for ln in lines[:40]:
        if _looks_like_heading_line(ln):
            continue
        if _looks_like_name_line(ln):
            return ln

    # 4) Name-like line near the end (pptx exports often put the name there).
    tail_start = max(0, len(lines) - 60)
    for i in range(tail_start, len(lines)):
        ln = lines[i]
        if _looks_like_heading_line(ln):
            continue
        ctx = " ".join(lines[max(0, i - 2) : min(len(lines), i + 3)]).lower()
        if NAME_INSTITUTION_RE.search(ctx):
            continue
        if _looks_like_name_line(ln):
            return ln

    return None


def _looks_like_heading_line(line: str) -> bool:
    low = (line or "").strip().lower()
    if not low:
        return False
    if low in _NAME_BAD_WORDS:
        return True
    if low.startswith("top skills"):
        return True
    if len(low.split()) <= 3 and any(w in low for w in ("skills", "experience", "education", "languages")):
        return True
    return False


def _looks_like_name_line(line: str) -> bool:
    if not line:
        return False
    if len(line) > 80:
        return False
    low = line.lower().strip()
    if low in NAME_STOPWORDS:
        return False
    if _looks_like_heading_line(line):
        return False
    if re.search(r"\b(resume|cv|резюме)\b", line, re.I):
        return False
    if NAME_INSTITUTION_RE.search(line):
        return False
    if not NAME_LINE_RE.match(line.strip()):
        return False
    return True


def extract_remote(text: str) -> Optional[bool]:
    if not text:
        return None
    for ln in text.splitlines()[:120]:
        if REMOTE_RE.search(ln):
            return True
    return None


def extract_english(text: str) -> Optional[str]:
    t = text or ""
    lines = [ln.strip() for ln in t.splitlines() if ln.strip()]

    # 1) CEFR levels anywhere are accepted.
    m = EN_RE.search(t)
    if m:
        return m.group(1).replace("+", "").upper()

    # 2) Textual levels only when English context is present.
    candidate_chunks: List[str] = []
    for i, ln in enumerate(lines):
        if EN_LANG_RE.search(ln):
            candidate_chunks.append(ln)
            if i + 1 < len(lines):
                candidate_chunks.append(lines[i + 1])

    if not candidate_chunks:
        return None

    m2 = EN_TEXT_RE.search("\n".join(candidate_chunks))
    if not m2:
        return None
    word = m2.group(1).lower()
    if word in ("native", "fluent", "proficient", "advanced"):
        return "C1"
    if word.startswith("upper"):
        return "B2"
    if word == "intermediate":
        return "B1"
    if word == "elementary":
        return "A2"
    return None

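# Illustrative mapping: "English: Upper-Intermediate" -> "B2";
# a bare CEFR mark such as "B2+" -> "B2"; "Intermediate" with no
# English/"английский" context on a nearby line -> None.
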
def extract_roles_skills(text: str) -> Tuple[List[str], List[str]]:
    """
    Extracts roles and skills, but strictly filters out HR/recruitment context.
    """
    lines = text.splitlines()

    # 1. Filter text: remove lines that talk about hiring/vacancies.
    clean_lines = []
    for ln in lines:
        if not HR_CONTEXT_RE.search(ln):
            clean_lines.append(ln)

    clean_text = "\n".join(clean_lines).lower()

    # 2. Extract skills from the clean text only.
    skills = []
    for s, pat in _SKILL_PATTERNS:
        if pat.search(clean_text):
            skills.append(normalize_skill(s) or s)
    skills = sorted(set(skills))

    # 3. Extract roles.
    # Priority: header (first 10 lines).
    header_text = "\n".join(lines[:10]).lower()

    found_roles = set()

    # Check if recruiter.
    if NON_TECH_ROLES_RE.search(header_text):
        # If an explicit recruiter title is in the header, do NOT add generic tech
        # roles like "backend" even if they appear in the text (they often describe
        # who the person hires).
        pass
    else:
        # Normal extraction.
        for r in ROLES:
            pats = _ROLE_PATTERNS.get(r, [])
            if any(p.search(clean_text) for p in pats):
                # Extra guard: devops requires explicit evidence, not just CI/CD mentions.
                if r == "devops":
                    if not re.search(r"\b(devops|dev ops|sre|platform engineer|infrastructure)\b", clean_text, re.I):
                        continue
                found_roles.add(r)

    return sorted(list(found_roles)), skills


def norm_pipe(tokens: List[str]) -> str:
    toks = [_norm_token(t) for t in tokens if _norm_token(t)]
    uniq = sorted(set(toks))
    return "|" + "|".join(uniq) + "|" if uniq else "|"

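# Illustrative: norm_pipe(["Python", "go", "  GO "]) -> "|go|python|"
# (tokens are lower-cased, whitespace-normalized, deduplicated, and sorted;
# the pipe-delimited shape suits substring filters like "%|go|%" - assumed usage).
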
def extract_salary(text: str) -> Tuple[Optional[int], Optional[int], float, Dict]:
    dbg: Dict[str, Any] = {"numbers": [], "currency_hits": 0, "hint_lines": 0, "used_lines": []}
    lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]
    if not lines:
        return None, None, 0.0, dbg

    candidates: List[Tuple[int, str, bool, bool]] = []
    for i, ln in enumerate(lines):
        has_hint = SALARY_HINT_RE.search(ln) is not None
        has_pay = PAY_TOKEN_RE.search(ln) is not None
        if not has_hint and not has_pay:
            continue
        if SALARY_NOISE_RE.search(ln) and not has_hint:
            continue
        candidates.append((i, ln, has_hint, has_pay))

    if not candidates:
        return None, None, 0.0, dbg

    has_hint = any(x[2] for x in candidates)
    if not has_hint:
        # Inline pay without "salary" is allowed only near the header/contact block.
        candidates = [x for x in candidates if x[0] < 15]
        if not candidates:
            return None, None, 0.0, dbg

    scan_chunks: List[str] = []
    for i, ln, hint, _ in candidates:
        chunk = ln
        if hint and (i + 1) < len(lines):
            chunk = f"{chunk} {lines[i + 1]}"
        scan_chunks.append(chunk)
        dbg["used_lines"].append(ln)
        if hint:
            dbg["hint_lines"] += 1
        dbg["currency_hits"] += len(CURRENCY_RE.findall(chunk))

    nums: List[int] = []
    for chunk in scan_chunks:
        for m in NUM_RE.finditer(chunk):
            val = None
            if m.group(1) and m.group(2):
                val = int(m.group(1)) * 1000
            elif m.group(3):
                val = int(re.sub(r"\s+", "", m.group(3)))
            elif m.group(4):
                val = int(m.group(4))
            if val and 20_000 <= val <= 30_000_000:
                nums.append(val)
                dbg["numbers"].append(val)

    if not nums:
        return None, None, 0.0, dbg

    nums = sorted(nums)
    salary_min = nums[0]
    salary_max = nums[-1] if len(nums) > 1 else nums[0]

    if dbg["hint_lines"] > 0:
        conf = 0.82 if dbg["currency_hits"] > 0 else 0.70
    else:
        conf = 0.58 if dbg["currency_hits"] > 0 else 0.0

    if salary_max > salary_min * 4:
        conf -= 0.12
    if len(nums) == 1:
        conf -= 0.06

    conf = max(0.0, min(conf, 0.9))
    if conf < 0.45:
        return None, None, conf, dbg
    return salary_min, salary_max, conf, dbg

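# Illustrative: extract_salary("Expected salary: 250 000 руб")
# -> (250000, 250000, 0.76, dbg)
# (hint + currency give 0.82; a single number costs 0.06).
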
def extract_location_best_effort(text: str) -> Optional[str]:
    if not text:
        return None

    def _clean_loc(val: str) -> str:
        return re.sub(r"\s+", " ", (val or "").strip(" |,;"))

    def _is_loc_like(val: str, *, allow_single: bool = False) -> bool:
        v = _clean_loc(val)
        if not v or len(v) < 3 or len(v) > 90:
            return False
        if re.search(r"[@/\\]", v) or re.search(r"\d{3,}", v):
            return False
        if SECTION_HEADER_RE.match(v):
            return False
        if LOCATION_CITY_COUNTRY_RE.match(v):
            return True
        if allow_single and re.fullmatch(r"[A-Za-zА-Яа-я][A-Za-zА-Яа-я' .\-]{1,40}", v):
            return True
        return False

    patterns = [
        re.compile(r"(?i)\b(location|город|city)\s*:\s*(.{2,40})"),
        re.compile(r"(?i)\b(место)\s*:\s*(.{2,40})"),
        re.compile(r"(?i)\b(проживает|проживание)\s*:\s*(.{2,60})"),
    ]
    for p in patterns:
        m = p.search(text)
        if m:
            val = _clean_loc(m.group(2))
            if _is_loc_like(val, allow_single=True):
                return val

    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    head: List[str] = []
    for ln in lines[:60]:
        if SECTION_HEADER_RE.match(ln):
            low = ln.lower()
            if low in ("contacts", "contact", "contact info"):
                continue
            break
        head.append(ln)

    for ln in head:
        parts = [ln] + [seg.strip() for seg in ln.split("|") if seg.strip()]
        for seg in parts:
            if _is_loc_like(seg):
                return _clean_loc(seg)
    return None
211
extract/pdf_extract.py
Normal file
@@ -0,0 +1,211 @@
from __future__ import annotations

import re
import shutil
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple

try:  # optional dependency
    from pypdf import PdfReader  # type: ignore
except Exception:  # pragma: no cover
    try:
        from PyPDF2 import PdfReader  # type: ignore
    except Exception:  # pragma: no cover
        PdfReader = None  # type: ignore

try:  # optional dependency
    from pdfminer.high_level import extract_text as pdfminer_extract_text  # type: ignore
except Exception:  # pragma: no cover
    pdfminer_extract_text = None  # type: ignore


@dataclass
class PdfExtractResult:
    text: str
    pages: List[dict]
    method: str
    score: float
    flags: List[str]


_SECTION_HINTS = [
    "experience", "work experience", "skills", "education", "projects", "summary", "about",
    "опыт работы", "навыки", "образование", "проекты", "о себе",
]


def _which_pdftotext() -> Optional[str]:
    exe = shutil.which("pdftotext") or shutil.which("pdftotext.exe")
    return exe


def _run_pdftotext(path: Path, *, layout: bool, timeout_sec: int = 25) -> str:
    exe = _which_pdftotext()
    if not exe:
        return ""
    cmd = [exe]
    if layout:
        cmd.append("-layout")
    cmd += ["-nopgbrk", str(path), "-"]
    try:
        p = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout_sec,
            check=False,
            text=True,
            encoding="utf-8",
            errors="ignore",
        )
        return (p.stdout or "").strip()
    except Exception:
        return ""


def _extract_pages_pypdf(path: Path, max_pages: int = 60) -> List[dict]:
    if PdfReader is None:
        return []
    try:
        reader = PdfReader(str(path), strict=False)
    except Exception:
        return []
    pages: List[dict] = []
    for i, page in enumerate(getattr(reader, "pages", [])):
        if max_pages and i >= max_pages:
            break
        try:
            text = page.extract_text() or ""
        except Exception:
            text = ""
        pages.append({"page": i + 1, "text": text})
    return pages


def _extract_pdfminer(path: Path) -> str:
    if pdfminer_extract_text is None:
        return ""
    try:
        return (pdfminer_extract_text(str(path)) or "").strip()
    except Exception:
        return ""


def _quality_score(text: str) -> Tuple[float, List[str]]:
    flags: List[str] = []
    if not text:
        return 0.0, ["empty"]

    total = len(text)
    letters = sum(ch.isalpha() for ch in text)
    spaces = text.count(" ")
    alpha_ratio = letters / max(1, total)
    space_ratio = spaces / max(1, total)

    words = re.findall(r"[A-Za-zА-Яа-я0-9]+", text)
    avg_word_len = (sum(len(w) for w in words) / max(1, len(words))) if words else 0.0

    lines = [ln for ln in text.splitlines() if ln.strip()]
    long_lines = [ln for ln in lines if len(ln) > 200]
    long_line_ratio = (len(long_lines) / max(1, len(lines))) if lines else 0.0

    glued_hits = len(re.findall(r"[a-zа-я][A-ZА-Я]|[A-Za-zА-Яа-я][0-9]|[0-9][A-Za-zА-Яа-я]", text))

    section_hits = sum(1 for s in _SECTION_HINTS if s in text.lower())

    score = 0.0
    if alpha_ratio >= 0.45:
        score += 2.0
    elif alpha_ratio >= 0.30:
        score += 1.0
    else:
        flags.append("low_alpha")

    if 0.10 <= space_ratio <= 0.28:
        score += 1.0
    else:
        flags.append("odd_spacing")

    if 3.5 <= avg_word_len <= 9.0:
        score += 1.0
    else:
        flags.append("odd_word_len")

    if long_line_ratio <= 0.06:
        score += 1.0
    else:
        flags.append("long_lines")

    if glued_hits <= 6:
        score += 1.0
    else:
        flags.append("glued_text")

    if section_hits >= 2:
        score += 1.0
    elif section_hits == 1:
        score += 0.5

    if total < 200:
        flags.append("short_text")

    if alpha_ratio < 0.08 or total < 120:
        flags.append("scan_like")

    return score, flags


def deglue_text(text: str) -> str:
    if not text:
        return text
    t = text
    t = re.sub(r"([a-zа-я])([A-ZА-Я])", r"\1 \2", t)
    t = re.sub(r"([A-Za-zА-Яа-я])([0-9])", r"\1 \2", t)
    t = re.sub(r"([0-9])([A-Za-zА-Яа-я])", r"\1 \2", t)
    t = re.sub(r"([:;])([A-Za-zА-Яа-я])", r"\1 \2", t)
    return t

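# Illustrative: deglue_text("PythonDeveloper3years") -> "Python Developer 3 years"
# (spaces are reinserted at lower-to-upper and letter/digit boundaries).
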
def extract_pdf_best(path: Path, timeout_sec: int = 25) -> PdfExtractResult:
    candidates: List[Tuple[str, str]] = []

    txt_layout = _run_pdftotext(path, layout=True, timeout_sec=timeout_sec)
    if txt_layout:
        candidates.append(("pdftotext_layout", txt_layout))

    txt_plain = _run_pdftotext(path, layout=False, timeout_sec=timeout_sec)
    if txt_plain:
        candidates.append(("pdftotext_plain", txt_plain))

    txt_pypdf = ""
    if PdfReader is not None:
        pages = _extract_pages_pypdf(path)
        if pages:
            txt_pypdf = "\n\n".join(p.get("text", "") for p in pages if p.get("text"))
    if txt_pypdf:
        candidates.append(("pypdf", txt_pypdf))

    txt_pdfminer = _extract_pdfminer(path)
    if txt_pdfminer:
        candidates.append(("pdfminer", txt_pdfminer))

    if not candidates:
        return PdfExtractResult(text="", pages=[], method="none", score=0.0, flags=["empty"])

    best_method = "none"
    best_text = ""
    best_score = -1.0
    best_flags: List[str] = []
    for method, text in candidates:
        score, flags = _quality_score(text)
        if score > best_score:
            best_score = score
            best_method = method
            best_text = text
            best_flags = flags

    pages = _extract_pages_pypdf(path)
    best_text = deglue_text(best_text)
    return PdfExtractResult(text=best_text, pages=pages, method=best_method, score=best_score, flags=best_flags)
70
extract/sections.py
Normal file
@@ -0,0 +1,70 @@
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
|
||||||
|
_SECTION_PATTERNS: dict[str, List[re.Pattern]] = {
|
||||||
|
"contacts": [
|
||||||
|
re.compile(r"^\s*(contacts?|contact info|контакты)\s*$", re.I),
|
||||||
|
],
|
||||||
|
"about": [
|
||||||
|
re.compile(r"^\s*(summary|about|profile|objective|о\s+себе|обо\s+мне|профиль|цель)\s*$", re.I),
|
||||||
|
],
|
||||||
|
"skills": [
|
||||||
|
re.compile(r"^\s*(skills?|key skills|stack|tech( stack)?|навыки|технологии|компетенции)\s*$", re.I),
|
||||||
|
],
|
||||||
|
"experience": [
|
||||||
|
re.compile(r"^\s*(experience|work experience|employment|опыт\s+работы|опыт)\s*$", re.I),
|
||||||
|
],
|
||||||
|
"education": [
|
||||||
|
re.compile(r"^\s*(education|образование|курсы|certifications?|сертификаты)\s*$", re.I),
|
||||||
|
],
|
||||||
|
"projects": [
|
||||||
|
re.compile(r"^\s*(projects?|проекты)\s*$", re.I),
|
||||||
|
],
|
||||||
|
"languages": [
|
||||||
|
re.compile(r"^\s*(languages?|языки)\s*$", re.I),
|
||||||
|
],
|
||||||
|
"certifications": [
|
||||||
|
re.compile(r"^\s*(certifications?|сертификаты|курсы)\s*$", re.I),
|
||||||
|
],
|
||||||
|
"publications": [
|
||||||
|
re.compile(r"^\s*(publications?|публикации)\s*$", re.I),
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _match_header(line: str) -> Optional[str]:
|
||||||
|
for key, patterns in _SECTION_PATTERNS.items():
|
||||||
|
for rx in patterns:
|
||||||
|
if rx.match(line):
|
||||||
|
return key
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def split_sections(clean_text: str, doc_type: str | None = None) -> Dict[str, str]:
|
||||||
|
lines = [ln.strip() for ln in (clean_text or "").splitlines()]
|
||||||
|
sections: Dict[str, List[str]] = {"header": []}
|
||||||
|
current = "header"
|
||||||
|
|
||||||
|
for ln in lines:
|
||||||
|
if not ln:
|
||||||
|
continue
|
||||||
|
key = _match_header(ln)
|
||||||
|
if key:
|
||||||
|
current = key
|
||||||
|
sections.setdefault(current, [])
|
||||||
|
continue
|
||||||
|
sections.setdefault(current, []).append(ln)
|
||||||
|
|
||||||
|
out: Dict[str, str] = {}
|
||||||
|
for k, vals in sections.items():
|
||||||
|
text = "\n".join(vals).strip()
|
||||||
|
if text:
|
||||||
|
out[k] = text
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def sections_present(sections: Dict[str, str]) -> List[str]:
|
||||||
|
return sorted([k for k, v in (sections or {}).items() if v and k != "header"])
|
||||||
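
# Small illustration of the header-driven splitting (made-up resume text):
#   split_sections("Ivan Ivanov\nSkills\nPython, FastAPI\nExperience\nBackend dev at ACME")
#   -> {"header": "Ivan Ivanov",
#       "skills": "Python, FastAPI",
#       "experience": "Backend dev at ACME"}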
1
extract/templates/__init__.py
Normal file
@@ -0,0 +1 @@
__all__ = []
46
extract/templates/generic.py
Normal file
@@ -0,0 +1,46 @@
from __future__ import annotations

from typing import Any, Dict

from tg_resume_db.extract.parse import (
    extract_contacts,
    extract_name_guess,
    extract_remote,
    extract_english,
    extract_roles_skills,
    extract_salary,
    extract_location_best_effort,
    extract_experience_years,
)


def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
    text = clean_text or ""

    contacts_raw = extract_contacts(text)
    name = extract_name_guess(text)
    remote = extract_remote(text)
    english = extract_english(text)
    roles, skills = extract_roles_skills(text)
    location = extract_location_best_effort(text)
    exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(text)
    sal_min, sal_max, sal_conf, sal_dbg = extract_salary(text)

    return {
        "name": name,
        "contacts_raw": contacts_raw,
        "remote": remote,
        "english": english,
        "roles": roles,
        "skills": skills,
        "location": location,
        "exp_years": exp_years,
        "exp_years_eng": exp_years_eng,
        "exp_conf": exp_conf,
        "exp_dbg": exp_dbg,
        "salary_min": sal_min,
        "salary_max": sal_max,
        "salary_conf": sal_conf,
        "salary_dbg": sal_dbg,
        "parse_method": "generic_heur",
    }
58
extract/templates/hh.py
Normal file
@@ -0,0 +1,58 @@
from __future__ import annotations

from typing import Any, Dict

from tg_resume_db.extract.parse import (
    extract_contacts,
    extract_name_guess,
    extract_remote,
    extract_english,
    extract_roles_skills,
    extract_salary,
    extract_location_best_effort,
    extract_experience_years,
)


def _pick(sections: Dict[str, str] | None, key: str, fallback: str) -> str:
    if not sections:
        return fallback
    return sections.get(key) or fallback


def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
    header_text = _pick(sections, "header", clean_text)
    contacts_text = _pick(sections, "contacts", clean_text)
    about_text = _pick(sections, "about", clean_text)
    skills_text = _pick(sections, "skills", clean_text)
    exp_text = _pick(sections, "experience", clean_text)
    exp_scope = "\n".join([about_text, exp_text]).strip() or exp_text

    name = extract_name_guess(header_text)
    contacts_raw = extract_contacts(contacts_text)
    roles, skills = extract_roles_skills("\n".join([about_text, skills_text, exp_text]))

    remote = extract_remote(clean_text)
    english = extract_english(clean_text)
    location = extract_location_best_effort(clean_text)
    exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(exp_scope)
    sal_min, sal_max, sal_conf, sal_dbg = extract_salary(clean_text)

    return {
        "name": name,
        "contacts_raw": contacts_raw,
        "remote": remote,
        "english": english,
        "roles": roles,
        "skills": skills,
        "location": location,
        "exp_years": exp_years,
        "exp_years_eng": exp_years_eng,
        "exp_conf": exp_conf,
        "exp_dbg": exp_dbg,
        "salary_min": sal_min,
        "salary_max": sal_max,
        "salary_conf": sal_conf,
        "salary_dbg": sal_dbg,
        "parse_method": "hh_template",
    }
85
extract/templates/hh_ru.py
Normal file
@@ -0,0 +1,85 @@
from __future__ import annotations

import re
from typing import Any, Dict, Optional

from tg_resume_db.extract.parse import (
    extract_contacts,
    extract_name_guess,
    extract_remote,
    extract_english,
    extract_roles_skills,
    extract_salary,
    extract_location_best_effort,
    extract_experience_years,
)


_DESIRED_RE = re.compile(r"(?i)жел[а-я]*\s+должност[ьи]\s*[:\-]?\s*(.+)")
_SPEC_RE = re.compile(r"(?i)специализаци[яи]\s*[:\-]?\s*(.+)")
_SCHEDULE_RE = re.compile(r"(?i)график\s+работы\s*[:\-]?\s*(.+)")
_EMPLOYMENT_RE = re.compile(r"(?i)занятость\s*[:\-]?\s*(.+)")


def _pick(sections: Dict[str, str] | None, key: str, fallback: str) -> str:
    if not sections:
        return fallback
    return sections.get(key) or fallback


def _find_first(regex: re.Pattern, text: str) -> Optional[str]:
    for ln in text.splitlines():
        m = regex.search(ln)
        if m:
            val = m.group(1).strip()
            val = re.split(r"[|;/]", val)[0].strip()
            if 2 <= len(val) <= 80:
                return val
    return None


def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
    header_text = _pick(sections, "header", clean_text)
    contacts_text = _pick(sections, "contacts", clean_text)
    about_text = _pick(sections, "about", clean_text)
    skills_text = _pick(sections, "skills", clean_text)
    exp_text = _pick(sections, "experience", clean_text)
    exp_scope = "\n".join([about_text, exp_text]).strip() or exp_text

    name = extract_name_guess(header_text)
    contacts_raw = extract_contacts(contacts_text)
    roles, skills = extract_roles_skills("\n".join([about_text, skills_text, exp_text]))

    remote = extract_remote(clean_text)
    english = extract_english(clean_text)
    location = extract_location_best_effort(clean_text)
    exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(exp_scope)
    sal_min, sal_max, sal_conf, sal_dbg = extract_salary(clean_text)

    desired_title = _find_first(_DESIRED_RE, clean_text)
    specializations = _find_first(_SPEC_RE, clean_text)
    schedule = _find_first(_SCHEDULE_RE, clean_text)
    employment = _find_first(_EMPLOYMENT_RE, clean_text)

    return {
        "name": name,
        "contacts_raw": contacts_raw,
        "remote": remote,
        "english": english,
        "roles": roles,
        "skills": skills,
        "location": location,
        "exp_years": exp_years,
        "exp_years_eng": exp_years_eng,
        "exp_conf": exp_conf,
        "exp_dbg": exp_dbg,
        "salary_min": sal_min,
        "salary_max": sal_max,
        "salary_conf": sal_conf,
        "salary_dbg": sal_dbg,
        "desired_title": desired_title,
        "specializations": specializations,
        "employment_type": employment,
        "schedule": schedule,
        "parse_method": "hh_template",
    }
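
# How the line-oriented field regexes behave on a typical hh.ru header block
# (made-up lines):
#   _find_first(_DESIRED_RE, "Желаемая должность: Backend разработчик | удалённо")
#   -> "Backend разработчик"   # tail after "|" is trimmed by the [|;/] split
#   _find_first(_SCHEDULE_RE, "График работы: полный день")
#   -> "полный день"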
57
extract/templates/linkedin.py
Normal file
@@ -0,0 +1,57 @@
from __future__ import annotations

from typing import Any, Dict

from tg_resume_db.extract.parse import (
    extract_contacts,
    extract_name_guess,
    extract_remote,
    extract_english,
    extract_roles_skills,
    extract_salary,
    extract_location_best_effort,
    extract_experience_years,
)


def _pick(sections: Dict[str, str] | None, key: str, fallback: str) -> str:
    if not sections:
        return fallback
    return sections.get(key) or fallback


def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
    header_text = _pick(sections, "header", clean_text)
    about_text = _pick(sections, "about", clean_text)
    skills_text = _pick(sections, "skills", clean_text)
    exp_text = _pick(sections, "experience", clean_text)
    exp_scope = "\n".join([about_text, exp_text]).strip() or exp_text

    name = extract_name_guess(header_text)
    contacts_raw = extract_contacts(clean_text)
    roles, skills = extract_roles_skills("\n".join([about_text, skills_text, exp_text]))

    remote = extract_remote(clean_text)
    english = extract_english(clean_text)
    location = extract_location_best_effort(clean_text)
    exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(exp_scope)
    sal_min, sal_max, sal_conf, sal_dbg = extract_salary(clean_text)

    return {
        "name": name,
        "contacts_raw": contacts_raw,
        "remote": remote,
        "english": english,
        "roles": roles,
        "skills": skills,
        "location": location,
        "exp_years": exp_years,
        "exp_years_eng": exp_years_eng,
        "exp_conf": exp_conf,
        "exp_dbg": exp_dbg,
        "salary_min": sal_min,
        "salary_max": sal_max,
        "salary_conf": sal_conf,
        "salary_dbg": sal_dbg,
        "parse_method": "linkedin_template",
    }
46
extract/templates/one_page.py
Normal file
@@ -0,0 +1,46 @@
from __future__ import annotations

from typing import Any, Dict

from tg_resume_db.extract.parse import (
    extract_contacts,
    extract_name_guess,
    extract_remote,
    extract_english,
    extract_roles_skills,
    extract_salary,
    extract_location_best_effort,
    extract_experience_years,
)


def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
    text = clean_text or ""

    contacts_raw = extract_contacts(text)
    name = extract_name_guess(text)
    roles, skills = extract_roles_skills(text)
    remote = extract_remote(text)
    english = extract_english(text)
    location = extract_location_best_effort(text)
    exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(text)
    sal_min, sal_max, sal_conf, sal_dbg = extract_salary(text)

    return {
        "name": name,
        "contacts_raw": contacts_raw,
        "remote": remote,
        "english": english,
        "roles": roles,
        "skills": skills,
        "location": location,
        "exp_years": exp_years,
        "exp_years_eng": exp_years_eng,
        "exp_conf": exp_conf,
        "exp_dbg": exp_dbg,
        "salary_min": sal_min,
        "salary_max": sal_max,
        "salary_conf": sal_conf,
        "salary_dbg": sal_dbg,
        "parse_method": "one_page_template",
    }
11
extract/templates/one_page_en.py
Normal file
@@ -0,0 +1,11 @@
from __future__ import annotations

from typing import Any, Dict

from tg_resume_db.extract.templates.one_page import parse_resume as _parse


def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
    out = _parse(clean_text, sections)
    out["parse_method"] = "one_page_en"
    return out
11
extract/templates/one_page_ru.py
Normal file
@@ -0,0 +1,11 @@
from __future__ import annotations

from typing import Any, Dict

from tg_resume_db.extract.templates.one_page import parse_resume as _parse


def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
    out = _parse(clean_text, sections)
    out["parse_method"] = "one_page_ru"
    return out
45
extract/templates/pptx_export.py
Normal file
@@ -0,0 +1,45 @@
from __future__ import annotations

from typing import Any, Dict

from tg_resume_db.extract.parse import (
    extract_contacts,
    extract_name_guess,
    extract_remote,
    extract_english,
    extract_roles_skills,
    extract_salary,
    extract_location_best_effort,
    extract_experience_years,
)


def parse_resume(clean_text: str, sections: Dict[str, str] | None = None) -> Dict[str, Any]:
    text = clean_text or ""
    contacts_raw = extract_contacts(text)
    name = extract_name_guess(text)
    roles, skills = extract_roles_skills(text)
    remote = extract_remote(text)
    english = extract_english(text)
    location = extract_location_best_effort(text)
    exp_years, exp_years_eng, exp_conf, exp_dbg = extract_experience_years(text)
    sal_min, sal_max, sal_conf, sal_dbg = extract_salary(text)

    return {
        "name": name,
        "contacts_raw": contacts_raw,
        "remote": remote,
        "english": english,
        "roles": roles,
        "skills": skills,
        "location": location,
        "exp_years": exp_years,
        "exp_years_eng": exp_years_eng,
        "exp_conf": exp_conf,
        "exp_dbg": exp_dbg,
        "salary_min": sal_min,
        "salary_max": sal_max,
        "salary_conf": sal_conf,
        "salary_dbg": sal_dbg,
        "parse_method": "pptx_template",
    }
99
extract/text_extract.py
Normal file
@@ -0,0 +1,99 @@
from __future__ import annotations

import os
from pathlib import Path
import logging
from bs4 import BeautifulSoup

try:  # optional dependency for PDF fallback
    from pypdf import PdfReader as _PdfReader  # type: ignore
except Exception:  # pragma: no cover - optional import
    try:
        from PyPDF2 import PdfReader as _PdfReader  # type: ignore
    except Exception:  # pragma: no cover
        _PdfReader = None  # type: ignore


def _read_bytes(path: Path) -> bytes:
    return path.read_bytes()


def extract_text_from_txt(path: Path) -> str:
    data = _read_bytes(path)
    # Strict decode inside the loop: with errors="ignore" the first codec would
    # always "succeed" and the fallback chain would never run.
    for enc in ("utf-8", "utf-16", "cp1251", "latin-1"):
        try:
            return data.decode(enc)
        except Exception:
            continue
    return data.decode("utf-8", errors="ignore")


def extract_text_from_html(path: Path) -> str:
    html = extract_text_from_txt(path)
    soup = BeautifulSoup(html, "lxml")
    return soup.get_text("\n", strip=True)


def extract_text_from_docx(path: Path) -> str:
    from docx import Document

    doc = Document(str(path))
    parts = []
    for p in doc.paragraphs:
        if p.text and p.text.strip():
            parts.append(p.text.strip())
    for table in doc.tables:
        for row in table.rows:
            cells = [c.text.strip() for c in row.cells if c.text and c.text.strip()]
            if cells:
                parts.append(" | ".join(cells))
    return "\n".join(parts)


_PDF_PAGE_LIMIT = int(os.environ.get("PDF_PAGE_LIMIT", "40"))
# Silence noisy pypdf warnings like "Ignoring wrong pointing object ..."
logging.getLogger("pypdf").setLevel(logging.ERROR)
logging.getLogger("PyPDF2").setLevel(logging.ERROR)


def extract_text_from_pdf(path: Path) -> str:
    """
    Lightweight PDF extractor; prefers optional PyPDF-based readers over heavy pdfminer.
    Reads at most PDF_PAGE_LIMIT pages (default 40) to avoid pathological files.
    """
    if _PdfReader is None:
        raise RuntimeError("PDF reader dependency missing (install pypdf or PyPDF2)")

    try:
        reader = _PdfReader(str(path), strict=False)
    except Exception as exc:  # pragma: no cover - pdf parser edge cases
        raise RuntimeError(f"PDF read failed: {exc}") from exc

    parts = []
    for idx, page in enumerate(getattr(reader, "pages", [])):
        if _PDF_PAGE_LIMIT and idx >= _PDF_PAGE_LIMIT:
            break
        try:
            text = page.extract_text()  # type: ignore[attr-defined]
        except Exception:
            text = None
        if text:
            parts.append(text)
    return "\n".join(parts)


def extract_text_from_doc_best_effort(path: Path) -> str:
    # .doc requires external tools; best-effort if textract installed
    try:
        import textract  # type: ignore

        b = textract.process(str(path))
        return b.decode("utf-8", errors="ignore")
    except Exception:
        return ""


def extract_text(path: Path) -> str:
    ext = path.suffix.lower()
    if ext in (".txt", ".log"):
        return extract_text_from_txt(path)
    if ext in (".html", ".htm"):
        return extract_text_from_html(path)
    if ext == ".docx":
        return extract_text_from_docx(path)
    if ext == ".pdf":
        return extract_text_from_pdf(path)
    if ext == ".doc":
        return extract_text_from_doc_best_effort(path)
    return ""
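
# Usage sketch for the dispatcher (hypothetical path); unsupported extensions
# return "" rather than raising:
if __name__ == "__main__":
    path = Path("cv.pdf")
    print(path.suffix, "->", len(extract_text(path)), "chars")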
21
importers/file_scan.py
Normal file
@@ -0,0 +1,21 @@
from __future__ import annotations

from pathlib import Path
from typing import Dict, Iterator

RESUME_EXTS = {".pdf", ".docx", ".doc", ".txt", ".html", ".htm"}


def iter_files(root: Path) -> Iterator[Dict]:
    for p in root.rglob("*"):
        if p.is_file() and p.suffix.lower() in RESUME_EXTS:
            yield {
                "origin_type": "file_scan",
                "export_path": str(root),
                "chat_title": None,
                "message_id": None,
                "message_date": None,
                "message_text": "",
                "file_path": str(p.resolve()),
                "original_name": p.name,
                "extra": {},
            }
66
importers/telegram_html.py
Normal file
@@ -0,0 +1,66 @@
from __future__ import annotations

import re
from pathlib import Path
from typing import Dict, Iterator, List, Optional

from bs4 import BeautifulSoup

RESUME_EXTS = {".pdf", ".docx", ".doc", ".txt", ".html", ".htm"}


def find_messages_html(root: Path) -> List[Path]:
    return [p for p in root.rglob("messages*.html") if p.is_file()]


def iter_artifacts(messages_html: Path) -> Iterator[Dict]:
    html = messages_html.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "lxml")

    chat_title = None
    h = soup.find(class_=re.compile(r"page_header", re.I))
    if h:
        chat_title = h.get_text(" ", strip=True)
    chat_title = chat_title or messages_html.parent.name

    for msg in soup.select(".message.default.clearfix, .message"):
        message_id = msg.get("id") or None
        date_div = msg.select_one(".date")
        msg_date = date_div.get("title") if date_div else None

        text_div = msg.select_one(".text")
        msg_text = text_div.get_text("\n", strip=True) if text_div else ""

        file_path = None
        original_name = None
        for a in msg.find_all("a", href=True):
            href = a["href"]
            p = (messages_html.parent / href).resolve()
            if p.exists() and p.suffix.lower() in RESUME_EXTS:
                file_path = str(p)
                original_name = p.name
                break

        if file_path:
            yield {
                "origin_type": "telegram_html",
                "export_path": str(messages_html.parent),
                "chat_title": chat_title,
                "message_id": str(message_id) if message_id else None,
                "message_date": msg_date,
                "message_text": msg_text or "",
                "file_path": file_path,
                "original_name": original_name,
                "extra": {"html_path": str(messages_html)},
            }
        elif msg_text and len(msg_text.strip()) >= 500:
            yield {
                "origin_type": "message_text",
                "export_path": str(messages_html.parent),
                "chat_title": chat_title,
                "message_id": str(message_id) if message_id else None,
                "message_date": msg_date,
                "message_text": msg_text,
                "file_path": None,
                "original_name": None,
                "extra": {"html_path": str(messages_html)},
            }
73
importers/telegram_json.py
Normal file
@@ -0,0 +1,73 @@
from __future__ import annotations

import json
from pathlib import Path
from typing import Dict, Iterator, List, Optional

RESUME_EXTS = {".pdf", ".docx", ".doc", ".txt", ".html", ".htm"}


def find_result_json(root: Path) -> List[Path]:
    return list(root.rglob("result.json"))


def _text_field_to_str(text_field) -> str:
    if isinstance(text_field, str):
        return text_field
    if isinstance(text_field, list):
        parts = []
        for item in text_field:
            if isinstance(item, str):
                parts.append(item)
            elif isinstance(item, dict) and "text" in item:
                parts.append(str(item["text"]))
        return "".join(parts)
    return ""
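
# Telegram's result.json stores message text either as a plain string or as a
# list mixing strings and entity dicts; both shapes collapse to plain text:
#   _text_field_to_str("hello") == "hello"
#   _text_field_to_str(["resume: ", {"type": "link", "text": "http://x"}]) == "resume: http://x"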

def iter_artifacts(result_json: Path) -> Iterator[Dict]:
    data = json.loads(result_json.read_text(encoding="utf-8", errors="ignore"))

    chats = []
    if isinstance(data, dict):
        chats = data.get("chats", {}).get("list", []) or data.get("chats", []) or []
    for chat in chats:
        chat_title = chat.get("name") or chat.get("title") or "unknown_chat"
        messages = chat.get("messages", []) or []
        for msg in messages:
            msg_id = str(msg.get("id") or "")
            msg_date = msg.get("date") or msg.get("date_unixtime") or None
            text = _text_field_to_str(msg.get("text", ""))

            file_rel = msg.get("file") or None
            file_path = None
            original_name = None
            if file_rel:
                p = (result_json.parent / file_rel).resolve()
                if p.exists() and p.suffix.lower() in RESUME_EXTS:
                    file_path = str(p)
                    original_name = p.name

            if file_path:
                yield {
                    "origin_type": "telegram_json",
                    "export_path": str(result_json.parent),
                    "chat_title": chat_title,
                    "message_id": msg_id,
                    "message_date": str(msg_date) if msg_date is not None else None,
                    "message_text": text or "",
                    "file_path": file_path,
                    "original_name": original_name,
                    "extra": {"json_path": str(result_json)},
                }
            # message-only resume paste (heuristic)
            elif text and len(text.strip()) >= 500:
                yield {
                    "origin_type": "message_text",
                    "export_path": str(result_json.parent),
                    "chat_title": chat_title,
                    "message_id": msg_id,
                    "message_date": str(msg_date) if msg_date is not None else None,
                    "message_text": text,
                    "file_path": None,
                    "original_name": None,
                    "extra": {"json_path": str(result_json)},
                }
174
normalize.py
Normal file
@@ -0,0 +1,174 @@
from __future__ import annotations

import re
from typing import Dict, List, Optional, Tuple


_SKILL_SYNONYMS: Dict[str, List[str]] = {
    "python": ["py"],
    "javascript": ["js", "node", "nodejs", "java script", "java-script"],
    "typescript": ["ts", "type script", "type-script"],
    "postgresql": ["postgres", "psql"],
    "kubernetes": ["k8s"],
    "docker": [],
    "fastapi": [],
    "django": ["drf", "django rest framework"],
    "flask": [],
    "golang": ["go"],
    "c++": ["cpp"],
    "c#": ["csharp"],
    "redis": [],
    "kafka": [],
    "rabbitmq": [],
    "grpc": [],
    "rest": [],
}

_SKILL_STOP = {"rest", "http", "json", "xml", "oop"}

_ROLE_SYNONYMS: Dict[str, List[str]] = {
    "backend": ["backend developer", "backend engineer", "бэкенд", "бекенд", "серверный разработчик"],
    "frontend": ["frontend developer", "frontend engineer", "фронтенд", "фронт"],
    "fullstack": ["full stack", "full-stack", "фулстек", "fullstack developer"],
    "devops": ["sre", "site reliability"],
    "qa": ["tester", "тестировщик"],
    "data": ["data engineer", "data scientist", "ml engineer", "машинное обучение"],
    "mobile": ["android", "ios", "mobile developer", "мобильный разработчик"],
}


def _build_alias_map(src: Dict[str, List[str]]) -> Dict[str, str]:
    alias = {}
    for canonical, al in src.items():
        alias[canonical] = canonical
        for a in al:
            alias[a] = canonical
    return {k.lower(): v for k, v in alias.items()}


_SKILL_ALIAS = _build_alias_map(_SKILL_SYNONYMS)
_ROLE_ALIAS = _build_alias_map(_ROLE_SYNONYMS)


def _normalize_skill_surface(token: str) -> str:
    t = (token or "").strip().lower()
    if not t:
        return ""
    t = t.replace("/", " ")
    t = re.sub(r"[_\-]+", " ", t)
    t = re.sub(r"\s+", " ", t).strip()

    # "java script", "type script", "postgre sql", "graph ql", "g rpc"
    t = re.sub(r"\bjava\s+script\b", "javascript", t)
    t = re.sub(r"\btype\s+script\b", "typescript", t)
    t = re.sub(r"\bpostgre\s+sql\b", "postgresql", t)
    t = re.sub(r"\bgraph\s+ql\b", "graphql", t)
    t = re.sub(r"\bg\s+rpc\b", "grpc", t)
    t = re.sub(r"\bdocker\s+compose\b", "docker compose", t)
    return t


def normalize_skill(token: str) -> Optional[str]:
    t = _normalize_skill_surface(token)
    if not t:
        return None

    # Avoid false-positive java from "javascript"
    if t == "java" and re.search(r"\bjava\s*script\b", _normalize_skill_surface(token)):
        return "javascript"

    return _SKILL_ALIAS.get(t, t)


def normalize_skills(skills: List[str]) -> List[str]:
    out: List[str] = []
    seen = set()
    for s in skills or []:
        canon = normalize_skill(s)
        if not canon or canon in seen:
            continue
        seen.add(canon)
        out.append(canon)
    return out


def normalize_role(token: str) -> Optional[str]:
    t = (token or "").strip().lower()
    if not t:
        return None
    return _ROLE_ALIAS.get(t, t)


def normalize_roles(roles: List[str]) -> List[str]:
    out: List[str] = []
    seen = set()
    for r in roles or []:
        canon = normalize_role(r)
        if not canon or canon in seen:
            continue
        seen.add(canon)
        out.append(canon)
    return out


def split_skills_primary_secondary(
    skills: List[str],
    *,
    clean_text: str,
    sections: Dict[str, str] | None = None,
    primary_limit: int = 25,
) -> Tuple[List[str], List[str]]:
    if not skills:
        return [], []

    text = (clean_text or "").lower()
    skills_section = (sections or {}).get("skills", "").lower()
    experience_section = (sections or {}).get("experience", "").lower()

    scores: Dict[str, float] = {}
    for sk in skills:
        s = sk.lower()
        score = 1.0
        if s in skills_section:
            score += 2.2
        if s in experience_section:
            score += 1.2
        count = len(re.findall(r"\b" + re.escape(s) + r"\b", text))
        score += min(2.5, count * 0.5)
        if s in _SKILL_STOP:
            score -= 1.5
        scores[sk] = score

    ranked = sorted(skills, key=lambda x: scores.get(x, 0.0), reverse=True)
    primary = [s for s in ranked if scores.get(s, 0.0) >= 2.0][:primary_limit]
    secondary = [s for s in ranked if s not in primary]
    return primary, secondary
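
# Ranking sketch: skills named in the dedicated skills section clear the 2.0
# primary threshold, while stop-list generics like "rest" get penalized
# (made-up resume text):
#   split_skills_primary_secondary(
#       ["python", "fastapi", "rest"],
#       clean_text="Skills\npython, fastapi\nExperience\nBuilt REST APIs with python",
#       sections={"skills": "python, fastapi",
#                 "experience": "built REST APIs with python"},
#   )
#   -> (["python", "fastapi"], ["rest"])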

def normalize_location(raw: Optional[str]) -> Optional[str]:
    if not raw:
        return None
    t = raw.strip()
    low = t.lower()
    if low in ("москва", "moscow", "moscow, russia"):
        return "Moscow, Russia"
    if low in ("санкт-петербург", "спб", "питер", "saint petersburg"):
        return "Saint Petersburg, Russia"
    return t


def find_skills_in_text(text: str) -> List[str]:
    if not text:
        return []
    found: List[str] = []
    seen = set()
    low = _normalize_skill_surface(text)
    for alias, canon in _SKILL_ALIAS.items():
        key = _normalize_skill_surface(alias)
        if key in seen:
            continue
        if re.search(r"\b" + re.escape(key) + r"\b", low):
            if canon not in seen:
                found.append(canon)
                seen.add(canon)
    return found
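
# The alias map also powers fuzzy lookup in free text, e.g. (made-up sentence):
#   find_skills_in_text("We use k8s and nodejs in prod")
#   -> ["javascript", "kubernetes"]   # aliases resolve to canonical skill names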
45
pdf_merge.py
Normal file
@@ -0,0 +1,45 @@
from __future__ import annotations

from pathlib import Path
from typing import Iterable, List, Optional

from pypdf import PdfReader, PdfWriter


def merge_pdfs(pdf_paths: Iterable[str | Path], out_pdf_path: str | Path) -> dict:
    out_pdf_path = Path(out_pdf_path)
    out_pdf_path.parent.mkdir(parents=True, exist_ok=True)

    writer = PdfWriter()

    merged: List[str] = []
    skipped: List[str] = []

    for p in pdf_paths:
        path = Path(p)
        try:
            reader = PdfReader(str(path))
            # simply append the pages in order
            for page in reader.pages:
                writer.add_page(page)
            merged.append(str(path))
        except Exception:
            skipped.append(str(path))

    if merged:
        with out_pdf_path.open("wb") as f:
            writer.write(f)

    return {
        "out_pdf": str(out_pdf_path),
        "merged_count": len(merged),
        "skipped_count": len(skipped),
        "merged_files": merged,
        "skipped_files": skipped,
    }


def merge_all_pdfs_in_dir(files_dir: str | Path, out_pdf_path: str | Path) -> dict:
    files_dir = Path(files_dir)
    pdfs = sorted(files_dir.rglob("*.pdf")) + sorted(files_dir.rglob("*.PDF"))
    return merge_pdfs(pdfs, out_pdf_path)
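
# Typical call for building one review bundle per export directory
# (hypothetical paths):
if __name__ == "__main__":
    stats = merge_all_pdfs_in_dir("./files", "./out/bundle.pdf")
    print(f"merged {stats['merged_count']} PDFs, skipped {stats['skipped_count']}")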
1990
pipeline.py
Normal file
File diff suppressed because it is too large
393
search.py
Normal file
@@ -0,0 +1,393 @@
from __future__ import annotations

import json
import re
import sqlite3
from typing import Any, Dict, List, Tuple

from tg_resume_db.normalize import normalize_skill, find_skills_in_text


# -----------------------------
# Normalization helpers
# -----------------------------

def _norm_token(v: str) -> str:
    return " ".join(str(v).strip().lower().split())


def _as_list(v: Any) -> List[str]:
    """
    Accepts:
    - None
    - list
    - "a,b,c" (csv string)
    """
    if v is None:
        return []
    if isinstance(v, list):
        return [str(x) for x in v if str(x).strip()]
    s = str(v).strip()
    if not s:
        return []
    return [x.strip() for x in s.split(",") if x.strip()]


def _uniq_keep_order(xs: List[str]) -> List[str]:
    seen = set()
    out: List[str] = []
    for x in xs:
        t = _norm_token(x)
        if not t or t in seen:
            continue
        seen.add(t)
        out.append(t)
    return out

# -----------------------------
# Pipe-normalized columns filters
# skills_norm / roles_norm like: "|python|fastapi|"
# -----------------------------

def _pipe_any_clause(field: str, values: List[str]) -> Tuple[str, List[Any]]:
    vals = [_norm_token(x) for x in (values or []) if str(x).strip()]
    if not vals:
        return ("1=1", [])

    parts: List[str] = []
    args: List[Any] = []
    for v in vals:
        parts.append(f"instr({field}, ?) > 0")
        args.append(f"|{v}|")

    return "(" + " OR ".join(parts) + ")", args


def _pipe_all_clause(field: str, values: List[str]) -> Tuple[str, List[Any]]:
    vals = [_norm_token(x) for x in (values or []) if str(x).strip()]
    if not vals:
        return ("1=1", [])

    parts: List[str] = []
    args: List[Any] = []
    for v in vals:
        parts.append(f"instr({field}, ?) > 0")
        args.append(f"|{v}|")

    return "(" + " AND ".join(parts) + ")", args
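
# The pipe wrapping makes instr() behave like exact-token matching on the
# denormalized columns, e.g.:
#   _pipe_any_clause("c.skills_norm", ["python", "k8s"])
#   -> ("(instr(c.skills_norm, ?) > 0 OR instr(c.skills_norm, ?) > 0)",
#       ["|python|", "|k8s|"])
# "|python|" cannot false-match inside "|micropython|", unlike LIKE '%python%'.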

# -----------------------------
# FTS5 sanitizer (fixes comma/garbage breaking MATCH)
# -----------------------------

# allow longer queries (name lists, long prompts) without aggressive truncation
_FTS_MAX_TERMS = 48


def _fts_safe_query(q: str) -> str:
    """
    Turn a free-form recruiter text into a safe FTS5 MATCH expression.
    We intentionally DO NOT allow raw FTS syntax from user input,
    because it easily breaks on commas/quotes/etc.

    Example:
        "Backend developer, опыт 5+ лет, Java C++ Python" ->
        "\"backend\" OR \"developer\" OR \"опыт\" OR \"лет\" OR \"java\" OR \"cpp\" OR \"python\""
    """
    if not q:
        return "resume"

    s = q.strip().lower()

    # normalize common tokens
    s = s.replace("c++", "cpp")
    s = s.replace("c#", "csharp")
    s = s.replace(".net", "dotnet")

    # remove punctuation that breaks MATCH
    s = re.sub(r"[,\(\)\[\]\{\};:]+", " ", s)
    s = re.sub(r"\s+", " ", s).strip()

    # tokens (latin/cyrillic + digits + a few chars)
    terms = re.findall(r"[a-z0-9а-яё][a-z0-9а-яё._#+-]{1,}", s, flags=re.I)
    terms = terms[:_FTS_MAX_TERMS]

    if not terms:
        return "resume"

    # quote every term => safe; join with OR => broad query
    return " OR ".join([f"\"{t}\"" for t in terms])


def _parse_query_modifiers(q: str) -> Tuple[List[str], List[str], str]:
    """
    Extract +must and -exclude skills from query; return (must, exclude, cleaned_query).
    """
    if not q:
        return [], [], ""
    must_raw = re.findall(r"\+([A-Za-z0-9#.+-]{2,})", q)
    excl_raw = re.findall(r"\-([A-Za-z0-9#.+-]{2,})", q)
    must = []
    exclude = []
    for t in must_raw:
        canon = normalize_skill(t)
        if canon:
            must.append(canon)
    for t in excl_raw:
        canon = normalize_skill(t)
        if canon:
            exclude.append(canon)
    if " and " in q.lower() or " & " in q:
        must += find_skills_in_text(q)

    cleaned = re.sub(r"[+-][A-Za-z0-9#.+-]{2,}", " ", q)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    return _uniq_keep_order(must), _uniq_keep_order(exclude), cleaned
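
# Example of the +/- modifier syntax in a recruiter query (made-up input):
#   _parse_query_modifiers("backend +python +postgres -php, Moscow")
#   -> (["python", "postgresql"], ["php"], "backend , Moscow")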


# -----------------------------
# Contacts
# -----------------------------

def _fetch_contacts_map(con: sqlite3.Connection, candidate_id: str) -> Dict[str, List[str]]:
    rows = con.execute(
        "SELECT contact_type, contact_value FROM candidate_contacts WHERE candidate_id=?",
        (candidate_id,),
    ).fetchall()

    m: Dict[str, List[str]] = {}
    for r in rows:
        m.setdefault(r["contact_type"], []).append(r["contact_value"])

    # a bit tidier: drop duplicate contacts
    for k, vals in list(m.items()):
        m[k] = _uniq_keep_order(vals)

    return m


# -----------------------------
# Main search (FTS + filters)
# -----------------------------

def search(
    con: sqlite3.Connection,
    query: str,
    filters: Dict[str, Any],
    limit: int = 20,
    offset: int = 0,
) -> List[Dict[str, Any]]:
    """
    Search candidates using:
    - FTS5 for ranking/snippet
    - stack filters for skills/roles via pipe-normalized columns
    - basic filters: remote/location/experience/salary/english
    """

    where: List[str] = ["r.is_active = 1"]
    params: List[Any] = []

    must_skills, exclude_skills, cleaned_query = _parse_query_modifiers(query or "")

    # -------- basic filters --------
    if filters.get("remote") is not None:
        where.append("c.remote = ?")
        params.append(1 if bool(filters["remote"]) else 0)

    if filters.get("location"):
        where.append("c.location IS NOT NULL AND lower(c.location) LIKE ?")
        params.append("%" + str(filters["location"]).lower() + "%")

    # Use experience_years for SQL filtering (broad search); the strict
    # experience_years_eng check happens during post-filtering in agent.py.
    if filters.get("experience_min") is not None:
        where.append("c.experience_years IS NOT NULL AND c.experience_years >= ?")
        params.append(float(filters["experience_min"]))

    # Salary: "unknown salary doesn't exclude"
    if filters.get("salary_min") is not None:
        where.append("(c.salary_max IS NULL OR c.salary_max >= ?)")
        params.append(int(filters["salary_min"]))

    if filters.get("salary_max") is not None:
        where.append("(c.salary_min IS NULL OR c.salary_min <= ?)")
        params.append(int(filters["salary_max"]))

    if filters.get("doc_type"):
        where.append("r.doc_type = ?")
        params.append(str(filters["doc_type"]))

    # English: not filtered at the SQL level (otherwise B2 would miss C1/C2);
    # post-filtered in agent.py.

    # -------- roles/skills stack filters --------
    # backward compatibility
    skills_any: List[str] = []
    skills_all: List[str] = []
    roles_any: List[str] = []

    if filters.get("skill"):
        skills_any.append(str(filters["skill"]))
    if filters.get("role"):
        roles_any.append(str(filters["role"]))

    skills_any += _as_list(filters.get("skills_any"))
    skills_all += _as_list(filters.get("skills_all"))
    roles_any += _as_list(filters.get("roles_any"))

    skills_any = _uniq_keep_order([normalize_skill(s) or s for s in skills_any])
    skills_all = _uniq_keep_order([normalize_skill(s) or s for s in skills_all])
    roles_any = _uniq_keep_order(roles_any)

    if must_skills:
        skills_all = _uniq_keep_order(skills_all + must_skills)

    # Denis rule: if any skills were provided -> enforce ANY match
    if skills_any:
        clause, args = _pipe_any_clause("c.skills_norm", skills_any)
        where.append(clause)
        params.extend(args)

    if skills_all:
        clause, args = _pipe_all_clause("c.skills_norm", skills_all)
        where.append(clause)
        params.extend(args)

    if roles_any:
        clause, args = _pipe_any_clause("c.roles_norm", roles_any)
        where.append(clause)
        params.extend(args)

    if exclude_skills:
        for sk in exclude_skills:
            where.append("instr(c.skills_norm, ?) = 0")
            params.append(f"|{sk}|")

    # -------- FTS query (SAFE) --------
    fts_q = _fts_safe_query(cleaned_query or "")

    limit = max(1, min(int(limit or 20), 100))
    offset = max(0, int(offset or 0))

    # UPDATED SQL: added experience_years_eng and language/backend metadata
    sql = f"""
        SELECT
            c.candidate_id,
            c.name,
            c.location,
            c.remote,
            c.experience_years,
            c.experience_years_eng,
            c.experience_confidence,
            c.salary_min,
            c.salary_max,
            c.salary_confidence,
            c.english_level,
            c.roles_json,
            c.skills_json,
            c.primary_languages_json,
            c.backend_focus,
            r.doc_type,
            r.doc_type_confidence,
            r.parse_method,
            r.resume_id,
            snippet(resumes_fts, 2, '[', ']', '…', 14) AS snippet,
            bm25(resumes_fts) AS rank
        FROM resumes_fts
        JOIN resumes r ON r.resume_id = resumes_fts.resume_id
        JOIN candidates c ON c.candidate_id = resumes_fts.candidate_id
        WHERE resumes_fts MATCH ? AND {" AND ".join(where)}
        ORDER BY rank
        LIMIT ? OFFSET ?
    """

    rows = con.execute(sql, [fts_q] + params + [limit, offset]).fetchall()

    out: List[Dict[str, Any]] = []
    for row in rows:
        cand_id = row["candidate_id"]
        contacts_map = _fetch_contacts_map(con, cand_id)

        out.append(
            {
                "candidate_id": cand_id,
                "name": row["name"],
                "location": row["location"],
                "remote": bool(row["remote"]) if row["remote"] is not None else None,
                "experience_years": row["experience_years"],
                "experience_years_eng": row["experience_years_eng"],  # passed to agent
                "experience_confidence": row["experience_confidence"],
                "salary_min": row["salary_min"],
                "salary_max": row["salary_max"],
                "salary_confidence": row["salary_confidence"],
                "english_level": row["english_level"],
                "roles": json.loads(row["roles_json"] or "[]"),
                "skills": json.loads(row["skills_json"] or "[]"),
                "primary_languages": json.loads(row["primary_languages_json"] or "[]"),
                "backend_focus": (bool(row["backend_focus"]) if row["backend_focus"] is not None else None),
                "doc_type": row["doc_type"],
                "doc_type_confidence": row["doc_type_confidence"],
                "parse_method": row["parse_method"],
                "contacts": contacts_map,
                "resume_id": row["resume_id"],
                "snippet": row["snippet"],
                "rank": row["rank"],
            }
        )

    return out
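
# Minimal call sketch (assumes a populated candidates.db with the FTS tables):
#   from tg_resume_db.db import connect
#   con = connect("./candidates.db")
#   hits = search(con, "senior +python backend", {"remote": True, "experience_min": 3}, limit=10)
#   for h in hits:
#       print(h["name"], h["experience_years"], h["snippet"])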


# -----------------------------
# Agent helper (SearchPlan -> search())
# -----------------------------

def _join_csv(xs: List[str]) -> str:
    xs = [str(x).strip() for x in (xs or []) if str(x).strip()]
    return ",".join(xs)


def search_with_filters(con: sqlite3.Connection, plan: Any) -> Dict[str, Any]:
    """
    Wrapper for agent.py.
    Expects `plan` with fields:
        query_text, skills_any, skills_all, roles_any, location, remote,
        english_min, exp_years_min, salary_min, salary_max, limit, sort
    Returns:
        { "items": [...], "count": N }
    """
    filters = {
        "remote": getattr(plan, "remote", None),
        "location": getattr(plan, "location", None),
        "experience_min": getattr(plan, "exp_years_min", None),
        "salary_min": getattr(plan, "salary_min", None),
        "salary_max": getattr(plan, "salary_max", None),
        "english": getattr(plan, "english_min", None),
        "roles_any": _join_csv(getattr(plan, "roles_any", []) or []),
        "skills_any": _join_csv(getattr(plan, "skills_any", []) or []),
        "skills_all": _join_csv(getattr(plan, "skills_all", []) or []),
    }

    items = search(
        con,
        query=(getattr(plan, "query_text", "") or "").strip(),
        filters=filters,
        limit=int(getattr(plan, "limit", 20) or 20),
        offset=0,
    )

    sort_mode = (getattr(plan, "sort", "rank") or "rank").strip()

    if sort_mode == "exp_desc":
        def k(it: Dict[str, Any]):
            v = it.get("experience_years")
            return (v is None, -(v or 0.0))
        items = sorted(items, key=k)

    elif sort_mode == "salary_desc":
        def k(it: Dict[str, Any]):
            v = it.get("salary_max") if it.get("salary_max") is not None else it.get("salary_min")
            return (v is None, -(v or 0))
        items = sorted(items, key=k)

    return {"items": items, "count": len(items)}
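
# `plan` is accessed purely via getattr, so anything attribute-shaped works;
# a SimpleNamespace stand-in (hypothetical values, con from tg_resume_db.db.connect)
# shows the expected fields:
#   from types import SimpleNamespace
#   plan = SimpleNamespace(
#       query_text="backend developer", skills_any=["python"], skills_all=[],
#       roles_any=["backend"], location="Moscow", remote=True, english_min="B2",
#       exp_years_min=3, salary_min=None, salary_max=None, limit=20, sort="exp_desc",
#   )
#   res = search_with_filters(con, plan)  # -> {"items": [...], "count": N}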
33
util.py
Normal file
@@ -0,0 +1,33 @@
from __future__ import annotations

import json
import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Optional


def utc_iso() -> str:
    return datetime.utcnow().replace(microsecond=0).isoformat() + "Z"


class Logger:
    def __init__(self, log_path: Optional[str] = None):
        self.log_path = Path(log_path) if log_path else None
        if self.log_path:
            self.log_path.parent.mkdir(parents=True, exist_ok=True)

    def _write(self, level: str, msg: str, extra: Optional[Dict[str, Any]] = None) -> None:
        line = f"{utc_iso()} [{level}] {msg}"
        print(line, file=sys.stdout, flush=True)
        if self.log_path:
            payload = {"ts": utc_iso(), "level": level, "msg": msg, "extra": extra or {}}
            with self.log_path.open("a", encoding="utf-8") as f:
                f.write(json.dumps(payload, ensure_ascii=False) + "\n")

    def info(self, msg: str, extra: Optional[Dict[str, Any]] = None) -> None:
        self._write("INFO", msg, extra)

    def warn(self, msg: str, extra: Optional[Dict[str, Any]] = None) -> None:
        self._write("WARN", msg, extra)

    def error(self, msg: str, extra: Optional[Dict[str, Any]] = None) -> None:
        self._write("ERROR", msg, extra)
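
# Usage sketch; stdout gets the human-readable line while the JSONL file keeps
# the structured payload (hypothetical path):
if __name__ == "__main__":
    log = Logger("./logs/pipeline.jsonl")
    log.info("ingest started", {"root": "./exports"})
    log.warn("no files found", {"root": "./exports"})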