Initial commit

This commit is contained in:
2026-03-11 15:27:10 +03:00
commit 8b4b8d54d1
34 changed files with 7407 additions and 0 deletions

267
bundle_export.py Normal file
View File

@@ -0,0 +1,267 @@
from __future__ import annotations
import json
import os
import re
import shutil
import sqlite3
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional

# NEW: PDF merge helper (pypdf)
# pip install pypdf
try:
    from tg_resume_db.pdf_merge import merge_all_pdfs_in_dir
except Exception:  # keep bundle export working even if pypdf / the module is not installed
    merge_all_pdfs_in_dir = None
def _slug(s: str, max_len: int = 60) -> str:
s = (s or "").strip()
if not s:
return "candidate"
s = re.sub(r"\s+", " ", s)
s = re.sub(r"[^0-9A-Za-zА-Яа-я_\- ]+", "_", s)
s = s.replace(" ", "_")
s = re.sub(r"_+", "_", s).strip("_")
if not s:
return "candidate"
return s[:max_len]
def _safe_mkdir(p: Path) -> None:
p.mkdir(parents=True, exist_ok=True)
def _pick_source_paths(con: sqlite3.Connection, resume_id: str) -> List[str]:
"""
Возвращает список самых приоритетных путей к файлу резюме.
1) resumes.file_path
2) sources.original_file_path
3) некоторые варианты путей из sources.extra_json
"""
paths: List[str] = []
row = con.execute(
"SELECT file_path FROM resumes WHERE resume_id=?",
(resume_id,),
).fetchone()
if row and row["file_path"]:
paths.append(str(row["file_path"]))
cur = con.execute(
"""SELECT original_file_path, original_file_name, extra_json
FROM sources
WHERE resume_id=?""",
(resume_id,),
)
for r in cur.fetchall():
ofp = r["original_file_path"]
if ofp:
paths.append(str(ofp))
try:
extra = json.loads(r["extra_json"] or "{}")
if isinstance(extra, dict):
for k in ("file_path", "path", "local_path", "source_path"):
if extra.get(k):
paths.append(str(extra[k]))
except Exception:
pass
# дедуп
seen = set()
out: List[str] = []
for p in paths:
p2 = os.path.normpath(p)
if p2 in seen:
continue
seen.add(p2)
out.append(p2)
return out
def _copy_unique(src: Path, dst_dir: Path, base_name: str) -> Path:
ext = src.suffix.lower() if src.suffix else ""
candidate = f"{base_name}{ext}"
dst = dst_dir / candidate
if dst.exists():
i = 2
while True:
dst = dst_dir / f"{base_name}({i}){ext}"
if not dst.exists():
break
i += 1
shutil.copy2(src, dst)
return dst
def bundle_search_results(
    con: sqlite3.Connection,
    results: Iterable[Dict[str, Any]],
    out_dir: str,
    *,
    copy_files: bool = True,
    merge_text: bool = True,
    merge_pdf: bool = True,  # NEW
) -> Dict[str, Any]:
    """
    Export search results into a self-contained bundle directory.

    results: iterable of dicts, each containing at least:
      - resume_id
      - candidate_id
      - name (preferably)

    Creates under *out_dir*:
      - files/: copied resume source files
      - merged_resumes.txt: concatenated clean_text from the DB (if merge_text)
      - pdf/merged.pdf: all PDFs from files/ merged (if merge_pdf and pypdf is installed)
      - manifest.json
      - README.txt

    Returns a summary dict with output paths, copied/missing counters and
    PDF-merge info.  Items that fail to copy are recorded in the manifest
    rather than raising.
    """
    out_root = Path(out_dir).resolve()
    files_dir = out_root / "files"
    _safe_mkdir(files_dir)
    manifest: List[Dict[str, Any]] = []
    copied = 0
    missing = 0
    merged_parts: List[str] = []
    merged_txt_path = out_root / "merged_resumes.txt"
    for item in results:
        resume_id = item.get("resume_id")
        cand_id = item.get("candidate_id")
        name = item.get("name") or ""
        # Entries without both identifiers cannot be exported — skip silently.
        if not resume_id or not cand_id:
            continue
        # Merged TXT from the DB: one "=====" header plus clean_text per resume.
        if merge_text:
            row = con.execute(
                "SELECT clean_text FROM resumes WHERE resume_id=?",
                (resume_id,),
            ).fetchone()
            clean_text = (row["clean_text"] if row else "") or ""
            header = f"===== {name or cand_id} | {cand_id} | {resume_id} ====="
            merged_parts.append(header)
            merged_parts.append(clean_text.strip())
            merged_parts.append("")
        if not copy_files:
            continue
        # Probe candidate paths in priority order; first existing file wins.
        src_paths = _pick_source_paths(con, resume_id)
        src_found: Optional[Path] = None
        for sp in src_paths:
            p = Path(sp)
            if p.exists() and p.is_file():
                src_found = p
                break
        if not src_found:
            missing += 1
            manifest.append(
                {
                    "candidate_id": cand_id,
                    "name": name,
                    "resume_id": resume_id,
                    "copied": False,
                    "reason": "source_file_not_found",
                    "tried_paths": src_paths,
                }
            )
            continue
        # Destination name: human-readable slug + resume_id to keep it unique.
        base = f"{_slug(name) or _slug(cand_id)}__{resume_id}"
        try:
            dst = _copy_unique(src_found, files_dir, base)
            copied += 1
            manifest.append(
                {
                    "candidate_id": cand_id,
                    "name": name,
                    "resume_id": resume_id,
                    "copied": True,
                    "source_path": str(src_found),
                    "dest_path": str(dst),
                }
            )
        except Exception as e:
            # Best-effort: a failed copy is logged in the manifest, not raised.
            missing += 1
            manifest.append(
                {
                    "candidate_id": cand_id,
                    "name": name,
                    "resume_id": resume_id,
                    "copied": False,
                    "reason": f"copy_failed: {repr(e)}",
                    "source_path": str(src_found),
                }
            )
    # merged TXT
    if merge_text:
        merged_txt_path.write_text("\n".join(merged_parts), encoding="utf-8", errors="ignore")
    # NEW: merged PDF from files/*.pdf (only if the optional helper imported).
    merged_pdf_path: Optional[Path] = None
    pdf_info: Optional[Dict[str, Any]] = None
    if merge_pdf and copy_files and merge_all_pdfs_in_dir is not None:
        try:
            merged_pdf_path = out_root / "pdf" / "merged.pdf"
            _safe_mkdir(merged_pdf_path.parent)
            pdf_info = merge_all_pdfs_in_dir(files_dir, merged_pdf_path)
        except Exception as e:
            # Report the failure via pdf_info instead of breaking the bundle.
            pdf_info = {"error": f"merge_pdf_failed: {repr(e)}"}
    # manifest.json
    (out_root / "manifest.json").write_text(
        json.dumps(
            {
                "out_dir": str(out_root),
                "copied_files": copied,
                "missing_files": missing,
                "merged_text": str(merged_txt_path) if merge_text else None,
                "merged_pdf": str(merged_pdf_path) if merged_pdf_path else None,
                "pdf_info": pdf_info,
                "items": manifest,
            },
            ensure_ascii=False,
            indent=2,
        ),
        encoding="utf-8",
        errors="ignore",
    )
    # README
    readme_lines = [
        "Папка создана командой search.",
        "- files/: скопированные исходные файлы резюме",
        "- merged_resumes.txt: склейка текста clean_text из БД",
        "- manifest.json: что откуда скопировалось / что не найдено",
    ]
    if merge_pdf:
        if merge_all_pdfs_in_dir is None:
            readme_lines.append("- pdf/merged.pdf: НЕ создан (нужен пакет pypdf и модуль pdf_merge.py)")
        else:
            readme_lines.append("- pdf/merged.pdf: склейка всех PDF из files/ (если PDF были)")
    (out_root / "README.txt").write_text(
        "\n".join(readme_lines) + "\n",
        encoding="utf-8",
        errors="ignore",
    )
    return {
        "out_dir": str(out_root),
        "copied_files": copied,
        "missing_files": missing,
        "merged_text": str(merged_txt_path) if merge_text else None,
        "merged_pdf": str(merged_pdf_path) if merged_pdf_path else None,
        "manifest": str(out_root / "manifest.json"),
        "pdf_info": pdf_info,
    }