Files
tg_resume_db/bundle_export.py
2026-03-11 15:27:10 +03:00

268 lines
8.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from __future__ import annotations
import json
import os
import re
import shutil
import sqlite3
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional
# Optional PDF merge helper (the source notes it relies on pypdf).
# Install with: pip install pypdf
try:
    from tg_resume_db.pdf_merge import merge_all_pdfs_in_dir
except Exception:  # keep bundling usable when pypdf / the pdf_merge module is not installed
    merge_all_pdfs_in_dir = None
def _slug(s: str, max_len: int = 60) -> str:
    """Turn a free-form name into a token that is safe to use in a file name.

    Every run of characters other than ASCII letters/digits, Cyrillic letters
    (including Ё/ё), ``_`` and ``-`` is replaced by a single underscore;
    repeated underscores are collapsed and leading/trailing ones removed.

    Args:
        s: Source text; ``None`` or an empty string is accepted.
        max_len: Maximum length of the returned token.

    Returns:
        The sanitized token, cut to ``max_len`` characters, or the placeholder
        ``"candidate"`` when no usable character is left.
    """
    text = (s or "").strip()
    if not text:
        return "candidate"
    # The range "А-Яа-я" does not cover Ё/ё (their code points lie outside
    # it), so they are listed explicitly; otherwise "Фёдор" becomes "Ф_дор".
    # Whitespace is not in the allowed set, so it turns into "_" here as well.
    token = re.sub(r"[^0-9A-Za-zА-Яа-яЁё_\-]+", "_", text)
    token = re.sub(r"_+", "_", token).strip("_")
    if not token:
        return "candidate"
    # Cutting at max_len may land right after a separator; drop it so the
    # token never ends with a dangling underscore.
    return token[:max_len].rstrip("_")
def _safe_mkdir(p: Path) -> None:
    """Create directory ``p`` along with any missing parent directories.

    Calling it for a directory that already exists is a no-op.
    """
    p.mkdir(exist_ok=True, parents=True)
def _pick_source_paths(con: sqlite3.Connection, resume_id: str) -> List[str]:
    """Collect candidate locations of a resume's source file, best first.

    Priority order:
      1. ``resumes.file_path``
      2. every ``sources.original_file_path``
      3. path-like keys (``file_path``, ``path``, ``local_path``,
         ``source_path``) found in ``sources.extra_json``

    Rows are read by column name, so ``con`` must use a row factory that
    supports name lookup (for example ``sqlite3.Row``).

    Args:
        con: Open SQLite connection.
        resume_id: Value matched against ``resume_id`` in both tables.

    Returns:
        Paths normalized with ``os.path.normpath`` and de-duplicated, in
        priority order. Existence of the files is not checked here.
    """
    primary: List[str] = []
    from_extra: List[str] = []

    resume_row = con.execute(
        "SELECT file_path FROM resumes WHERE resume_id=?",
        (resume_id,),
    ).fetchone()
    if resume_row and resume_row["file_path"]:
        primary.append(str(resume_row["file_path"]))

    source_rows = con.execute(
        """SELECT original_file_path, original_file_name, extra_json
FROM sources
WHERE resume_id=?""",
        (resume_id,),
    ).fetchall()
    for src in source_rows:
        if src["original_file_path"]:
            primary.append(str(src["original_file_path"]))
        try:
            extra = json.loads(src["extra_json"] or "{}")
        except (TypeError, ValueError, RecursionError):
            # Best effort: a malformed extra_json must not hide the paths
            # gathered from the other columns/rows.
            continue
        if not isinstance(extra, dict):
            continue
        # Kept in a separate list so that extra_json paths always rank below
        # every original_file_path, as the priority order above states.
        from_extra.extend(
            str(extra[key])
            for key in ("file_path", "path", "local_path", "source_path")
            if extra.get(key)
        )

    # dict.fromkeys drops duplicates while preserving first-seen order.
    normalized = (os.path.normpath(p) for p in primary + from_extra)
    return list(dict.fromkeys(normalized))
def _copy_unique(src: Path, dst_dir: Path, base_name: str) -> Path:
    """Copy ``src`` into ``dst_dir`` without overwriting an existing file.

    The copy is named ``base_name`` plus the lower-cased extension of ``src``.
    If that name is taken, ``(2)``, ``(3)``, ... is appended to ``base_name``
    until a free name is found.

    Returns:
        Path of the file that was written.
    """
    suffix = src.suffix.lower()
    target = dst_dir / f"{base_name}{suffix}"
    counter = 2
    while target.exists():
        target = dst_dir / f"{base_name}({counter}){suffix}"
        counter += 1
    shutil.copy2(src, target)
    return target
def bundle_search_results(
    con: sqlite3.Connection,
    results: Iterable[Dict[str, Any]],
    out_dir: str,
    *,
    copy_files: bool = True,
    merge_text: bool = True,
    merge_pdf: bool = True,
) -> Dict[str, Any]:
    """Export search results into a bundle directory.

    Each item of ``results`` is a dict with at least ``resume_id`` and
    ``candidate_id`` (``name`` is optional but recommended); items missing
    either id are skipped.

    Created under ``out_dir``:
      - ``files/``: copies of the original resume files (if ``copy_files``)
      - ``merged_resumes.txt``: concatenated ``clean_text`` from the database
        (if ``merge_text``)
      - ``pdf/merged.pdf``: result of ``merge_all_pdfs_in_dir`` over ``files/``
        (if ``merge_pdf`` and ``copy_files`` and the helper was importable)
      - ``manifest.json``: per-item record of what was copied or not found
      - ``README.txt``

    Args:
        con: SQLite connection whose rows can be read by column name.
        results: Search hits to export.
        out_dir: Destination directory; created if missing.
        copy_files: Copy source files into ``files/``.
        merge_text: Write ``merged_resumes.txt``.
        merge_pdf: Attempt to build ``pdf/merged.pdf``.

    Returns:
        Summary dict with keys ``out_dir``, ``copied_files``,
        ``missing_files``, ``merged_text``, ``merged_pdf``, ``manifest`` and
        ``pdf_info``. ``merged_pdf`` is ``None`` unless the merged file was
        actually produced.
    """
    out_root = Path(out_dir).resolve()
    files_dir = out_root / "files"
    _safe_mkdir(files_dir)

    manifest: List[Dict[str, Any]] = []
    copied = 0
    missing = 0
    merged_parts: List[str] = []
    merged_txt_path = out_root / "merged_resumes.txt"

    for item in results:
        resume_id = item.get("resume_id")
        cand_id = item.get("candidate_id")
        name = item.get("name") or ""
        if not resume_id or not cand_id:
            continue

        # Merged TXT section taken from the database.
        if merge_text:
            row = con.execute(
                "SELECT clean_text FROM resumes WHERE resume_id=?",
                (resume_id,),
            ).fetchone()
            clean_text = (row["clean_text"] if row else "") or ""
            merged_parts.append(f"===== {name or cand_id} | {cand_id} | {resume_id} =====")
            merged_parts.append(clean_text.strip())
            merged_parts.append("")

        if not copy_files:
            continue

        entry: Dict[str, Any] = {
            "candidate_id": cand_id,
            "name": name,
            "resume_id": resume_id,
        }

        src_paths = _pick_source_paths(con, resume_id)
        # First candidate path that points at an existing regular file.
        src_found: Optional[Path] = next(
            (p for p in map(Path, src_paths) if p.is_file()),
            None,
        )
        if src_found is None:
            missing += 1
            manifest.append(
                {
                    **entry,
                    "copied": False,
                    "reason": "source_file_not_found",
                    "tried_paths": src_paths,
                }
            )
            continue

        # _slug never returns an empty string (it yields the placeholder
        # "candidate"), so an `or` fallback would never fire; compare with the
        # placeholder explicitly to fall back to the candidate id.
        label = _slug(name)
        if label == "candidate":
            label = _slug(str(cand_id))
        base = f"{label}__{resume_id}"

        try:
            dst = _copy_unique(src_found, files_dir, base)
        except Exception as e:
            # Per-item boundary: one unreadable file must not abort the bundle.
            missing += 1
            manifest.append(
                {
                    **entry,
                    "copied": False,
                    "reason": f"copy_failed: {repr(e)}",
                    "source_path": str(src_found),
                }
            )
        else:
            copied += 1
            manifest.append(
                {
                    **entry,
                    "copied": True,
                    "source_path": str(src_found),
                    "dest_path": str(dst),
                }
            )

    # Merged TXT
    if merge_text:
        merged_txt_path.write_text("\n".join(merged_parts), encoding="utf-8", errors="ignore")

    # Merged PDF built from files/*.pdf
    merged_pdf_path: Optional[Path] = None
    pdf_info: Optional[Dict[str, Any]] = None
    if merge_pdf and copy_files and merge_all_pdfs_in_dir is not None:
        pdf_target = out_root / "pdf" / "merged.pdf"
        try:
            _safe_mkdir(pdf_target.parent)
            pdf_info = merge_all_pdfs_in_dir(files_dir, pdf_target)
        except Exception as e:
            pdf_info = {"error": f"merge_pdf_failed: {repr(e)}"}
        else:
            # Report the merged PDF only when the helper really wrote it, so
            # the manifest never points at a file that does not exist.
            if pdf_target.is_file():
                merged_pdf_path = pdf_target

    merged_text_value = str(merged_txt_path) if merge_text else None
    merged_pdf_value = str(merged_pdf_path) if merged_pdf_path else None
    manifest_path = out_root / "manifest.json"

    # manifest.json
    manifest_path.write_text(
        json.dumps(
            {
                "out_dir": str(out_root),
                "copied_files": copied,
                "missing_files": missing,
                "merged_text": merged_text_value,
                "merged_pdf": merged_pdf_value,
                "pdf_info": pdf_info,
                "items": manifest,
            },
            ensure_ascii=False,
            indent=2,
        ),
        encoding="utf-8",
        errors="ignore",
    )

    # README
    readme_lines = [
        "Папка создана командой search.",
        "- files/: скопированные исходные файлы резюме",
        "- merged_resumes.txt: склейка текста clean_text из БД",
        "- manifest.json: что откуда скопировалось / что не найдено",
    ]
    if merge_pdf:
        if merge_all_pdfs_in_dir is None:
            readme_lines.append("- pdf/merged.pdf: НЕ создан (нужен пакет pypdf и модуль pdf_merge.py)")
        else:
            readme_lines.append("- pdf/merged.pdf: склейка всех PDF из files/ (если PDF были)")
    (out_root / "README.txt").write_text(
        "\n".join(readme_lines) + "\n",
        encoding="utf-8",
        errors="ignore",
    )

    return {
        "out_dir": str(out_root),
        "copied_files": copied,
        "missing_files": missing,
        "merged_text": merged_text_value,
        "merged_pdf": merged_pdf_value,
        "manifest": str(manifest_path),
        "pdf_info": pdf_info,
    }