from __future__ import annotations

import json
import os
import re
import shutil
import sqlite3
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional

# NEW: PDF merge helper (pypdf)
# pip install pypdf
try:
    from tg_resume_db.pdf_merge import merge_all_pdfs_in_dir
except Exception:  # don't break the bundle if pypdf / the module is not installed
    merge_all_pdfs_in_dir = None
def _slug(s: str, max_len: int = 60) -> str:
|
||
s = (s or "").strip()
|
||
if not s:
|
||
return "candidate"
|
||
s = re.sub(r"\s+", " ", s)
|
||
s = re.sub(r"[^0-9A-Za-zА-Яа-я_\- ]+", "_", s)
|
||
s = s.replace(" ", "_")
|
||
s = re.sub(r"_+", "_", s).strip("_")
|
||
if not s:
|
||
return "candidate"
|
||
return s[:max_len]
|
||
|
||
|
||
def _safe_mkdir(p: Path) -> None:
    """Create directory *p*, including missing parents; no error if it already exists."""
    p.mkdir(parents=True, exist_ok=True)
def _pick_source_paths(con: sqlite3.Connection, resume_id: str) -> List[str]:
|
||
"""
|
||
Возвращает список самых приоритетных путей к файлу резюме.
|
||
1) resumes.file_path
|
||
2) sources.original_file_path
|
||
3) некоторые варианты путей из sources.extra_json
|
||
"""
|
||
paths: List[str] = []
|
||
|
||
row = con.execute(
|
||
"SELECT file_path FROM resumes WHERE resume_id=?",
|
||
(resume_id,),
|
||
).fetchone()
|
||
if row and row["file_path"]:
|
||
paths.append(str(row["file_path"]))
|
||
|
||
cur = con.execute(
|
||
"""SELECT original_file_path, original_file_name, extra_json
|
||
FROM sources
|
||
WHERE resume_id=?""",
|
||
(resume_id,),
|
||
)
|
||
for r in cur.fetchall():
|
||
ofp = r["original_file_path"]
|
||
if ofp:
|
||
paths.append(str(ofp))
|
||
|
||
try:
|
||
extra = json.loads(r["extra_json"] or "{}")
|
||
if isinstance(extra, dict):
|
||
for k in ("file_path", "path", "local_path", "source_path"):
|
||
if extra.get(k):
|
||
paths.append(str(extra[k]))
|
||
except Exception:
|
||
pass
|
||
|
||
# дедуп
|
||
seen = set()
|
||
out: List[str] = []
|
||
for p in paths:
|
||
p2 = os.path.normpath(p)
|
||
if p2 in seen:
|
||
continue
|
||
seen.add(p2)
|
||
out.append(p2)
|
||
return out
|
||
|
||
|
||
def _copy_unique(src: Path, dst_dir: Path, base_name: str) -> Path:
|
||
ext = src.suffix.lower() if src.suffix else ""
|
||
candidate = f"{base_name}{ext}"
|
||
dst = dst_dir / candidate
|
||
|
||
if dst.exists():
|
||
i = 2
|
||
while True:
|
||
dst = dst_dir / f"{base_name}({i}){ext}"
|
||
if not dst.exists():
|
||
break
|
||
i += 1
|
||
|
||
shutil.copy2(src, dst)
|
||
return dst
|
||
|
||
|
||
def bundle_search_results(
    con: sqlite3.Connection,
    results: Iterable[Dict[str, Any]],
    out_dir: str,
    *,
    copy_files: bool = True,
    merge_text: bool = True,
    merge_pdf: bool = True,  # NEW
) -> Dict[str, Any]:
    """Bundle search results into *out_dir*.

    ``results`` is an iterable of dicts carrying at least:
      - resume_id
      - candidate_id
      - name (preferably)

    Creates:
      - files/: copies of the original resume files
      - merged_resumes.txt: concatenated ``clean_text`` from the DB (if *merge_text*)
      - pdf/merged.pdf: all PDFs from files/ merged (if *merge_pdf* and pypdf is installed)
      - manifest.json
      - README.txt

    Returns a summary dict (out_dir, copied/missing counters, produced paths,
    pdf_info).  Items missing resume_id or candidate_id are silently skipped.
    NOTE(review): assumes ``con.row_factory`` is ``sqlite3.Row`` — rows are
    indexed by column name below; confirm against the caller.
    """
    out_root = Path(out_dir).resolve()
    files_dir = out_root / "files"
    _safe_mkdir(files_dir)

    manifest: List[Dict[str, Any]] = []
    copied = 0   # files successfully copied into files/
    missing = 0  # source not found OR copy failed

    merged_parts: List[str] = []
    merged_txt_path = out_root / "merged_resumes.txt"

    for item in results:
        resume_id = item.get("resume_id")
        cand_id = item.get("candidate_id")
        name = item.get("name") or ""
        if not resume_id or not cand_id:
            continue

        # merged TXT from the DB
        if merge_text:
            row = con.execute(
                "SELECT clean_text FROM resumes WHERE resume_id=?",
                (resume_id,),
            ).fetchone()
            clean_text = (row["clean_text"] if row else "") or ""
            header = f"===== {name or cand_id} | {cand_id} | {resume_id} ====="
            merged_parts.append(header)
            merged_parts.append(clean_text.strip())
            merged_parts.append("")  # blank separator between resumes

        if not copy_files:
            continue

        src_paths = _pick_source_paths(con, resume_id)

        # First path that actually exists on disk wins.
        src_found: Optional[Path] = None
        for sp in src_paths:
            p = Path(sp)
            if p.exists() and p.is_file():
                src_found = p
                break

        if not src_found:
            missing += 1
            manifest.append(
                {
                    "candidate_id": cand_id,
                    "name": name,
                    "resume_id": resume_id,
                    "copied": False,
                    "reason": "source_file_not_found",
                    "tried_paths": src_paths,
                }
            )
            continue

        # _slug never returns an empty string, so the `or` fallback is a safety net.
        base = f"{_slug(name) or _slug(cand_id)}__{resume_id}"
        try:
            dst = _copy_unique(src_found, files_dir, base)
            copied += 1
            manifest.append(
                {
                    "candidate_id": cand_id,
                    "name": name,
                    "resume_id": resume_id,
                    "copied": True,
                    "source_path": str(src_found),
                    "dest_path": str(dst),
                }
            )
        except Exception as e:
            # Copy failures are recorded in the manifest, not raised.
            missing += 1
            manifest.append(
                {
                    "candidate_id": cand_id,
                    "name": name,
                    "resume_id": resume_id,
                    "copied": False,
                    "reason": f"copy_failed: {repr(e)}",
                    "source_path": str(src_found),
                }
            )

    # merged TXT
    if merge_text:
        merged_txt_path.write_text("\n".join(merged_parts), encoding="utf-8", errors="ignore")

    # NEW: merged PDF from files/*.pdf (only when the optional helper imported)
    merged_pdf_path: Optional[Path] = None
    pdf_info: Optional[Dict[str, Any]] = None
    if merge_pdf and copy_files and merge_all_pdfs_in_dir is not None:
        try:
            merged_pdf_path = out_root / "pdf" / "merged.pdf"
            _safe_mkdir(merged_pdf_path.parent)
            pdf_info = merge_all_pdfs_in_dir(files_dir, merged_pdf_path)
        except Exception as e:
            # Best effort: a failed merge is reported via pdf_info, not raised.
            pdf_info = {"error": f"merge_pdf_failed: {repr(e)}"}

    # manifest.json
    (out_root / "manifest.json").write_text(
        json.dumps(
            {
                "out_dir": str(out_root),
                "copied_files": copied,
                "missing_files": missing,
                "merged_text": str(merged_txt_path) if merge_text else None,
                "merged_pdf": str(merged_pdf_path) if merged_pdf_path else None,
                "pdf_info": pdf_info,
                "items": manifest,
            },
            ensure_ascii=False,
            indent=2,
        ),
        encoding="utf-8",
        errors="ignore",
    )

    # README (user-facing strings intentionally in Russian)
    readme_lines = [
        "Папка создана командой search.",
        "- files/: скопированные исходные файлы резюме",
        "- merged_resumes.txt: склейка текста clean_text из БД",
        "- manifest.json: что откуда скопировалось / что не найдено",
    ]
    if merge_pdf:
        if merge_all_pdfs_in_dir is None:
            readme_lines.append("- pdf/merged.pdf: НЕ создан (нужен пакет pypdf и модуль pdf_merge.py)")
        else:
            readme_lines.append("- pdf/merged.pdf: склейка всех PDF из files/ (если PDF были)")

    (out_root / "README.txt").write_text(
        "\n".join(readme_lines) + "\n",
        encoding="utf-8",
        errors="ignore",
    )

    return {
        "out_dir": str(out_root),
        "copied_files": copied,
        "missing_files": missing,
        "merged_text": str(merged_txt_path) if merge_text else None,
        "merged_pdf": str(merged_pdf_path) if merged_pdf_path else None,
        "manifest": str(out_root / "manifest.json"),
        "pdf_info": pdf_info,
    }