from __future__ import annotations

import json
import os
import re
import shutil
import sqlite3
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional

# Optional PDF merge helper (backed by pypdf; install with `pip install pypdf`).
try:
    from tg_resume_db.pdf_merge import merge_all_pdfs_in_dir
except Exception:
    # Keep bundling usable when pypdf or the helper module is not installed.
    merge_all_pdfs_in_dir = None


def _slug(s: str, max_len: int = 60, default: str = "candidate") -> str:
    """Turn free-form text into a file-name-safe slug.

    Whitespace runs become a single underscore, and every run of characters
    other than ASCII letters/digits, Cyrillic letters (including Ё/ё), ``_``
    and ``-`` is replaced by a single underscore.

    Args:
        s: Text to convert; ``None`` or an empty string is accepted.
        max_len: Maximum length of the returned slug.
        default: Value returned when nothing usable remains after cleaning.

    Returns:
        The slug, truncated to ``max_len`` characters, or ``default``.
    """
    s = (s or "").strip()
    if not s:
        return default
    s = re.sub(r"\s+", " ", s)
    s = re.sub(r"[^0-9A-Za-zА-Яа-яЁё_\- ]+", "_", s)
    s = s.replace(" ", "_")
    s = re.sub(r"_+", "_", s).strip("_")
    if not s:
        return default
    # Truncation may cut right after a separator; do not leave it dangling.
    return s[:max_len].rstrip("_")


def _safe_mkdir(p: Path) -> None:
    """Create directory ``p`` (and missing parents); no error if it exists."""
    p.mkdir(parents=True, exist_ok=True)


def _pick_source_paths(con: sqlite3.Connection, resume_id: str) -> List[str]:
    """Return candidate paths of the resume's source file, best first.

    Priority order:
      1. ``resumes.file_path``
      2. ``sources.original_file_path``
      3. the ``file_path`` / ``path`` / ``local_path`` / ``source_path`` keys
         of ``sources.extra_json``

    Rows are read by column name, so ``con`` needs a row factory that
    supports name lookup (for example ``sqlite3.Row``).

    Returns:
        Normalized paths (``os.path.normpath``) without duplicates, in
        priority order. The paths are not checked for existence.
    """
    paths: List[str] = []

    row = con.execute(
        "SELECT file_path FROM resumes WHERE resume_id=?",
        (resume_id,),
    ).fetchone()
    if row and row["file_path"]:
        paths.append(str(row["file_path"]))

    cur = con.execute(
        """SELECT original_file_path, original_file_name, extra_json
           FROM sources WHERE resume_id=?""",
        (resume_id,),
    )
    for r in cur.fetchall():
        ofp = r["original_file_path"]
        if ofp:
            paths.append(str(ofp))
        try:
            extra = json.loads(r["extra_json"] or "{}")
        except (TypeError, ValueError):
            # Malformed or non-text extra_json: it simply adds no paths.
            continue
        if isinstance(extra, dict):
            for k in ("file_path", "path", "local_path", "source_path"):
                if extra.get(k):
                    paths.append(str(extra[k]))

    # Order-preserving de-duplication on the normalized form.
    return list(dict.fromkeys(os.path.normpath(p) for p in paths))


def _copy_unique(src: Path, dst_dir: Path, base_name: str) -> Path:
    """Copy ``src`` into ``dst_dir`` without overwriting an existing file.

    The target is ``<base_name><ext>`` where ``ext`` is the lower-cased
    suffix of ``src``. If that name is taken, ``(2)``, ``(3)``, ... is
    inserted before the extension until a free name is found.

    Returns:
        The path the file was copied to.
    """
    ext = src.suffix.lower() if src.suffix else ""
    dst = dst_dir / f"{base_name}{ext}"
    i = 2
    while dst.exists():
        dst = dst_dir / f"{base_name}({i}){ext}"
        i += 1
    shutil.copy2(src, dst)
    return dst


def _first_existing_file(paths: Iterable[str]) -> Optional[Path]:
    """Return the first path in ``paths`` that is an existing regular file."""
    for raw in paths:
        candidate = Path(raw)
        if candidate.is_file():
            return candidate
    return None


def bundle_search_results(
    con: sqlite3.Connection,
    results: Iterable[Dict[str, Any]],
    out_dir: str,
    *,
    copy_files: bool = True,
    merge_text: bool = True,
    merge_pdf: bool = True,
) -> Dict[str, Any]:
    """Bundle the resumes behind search results into one output directory.

    Args:
        con: Database connection with ``resumes`` and ``sources`` tables.
            Rows are read by column name, so the connection needs a row
            factory that supports name lookup (for example ``sqlite3.Row``).
        results: Dicts with at least ``resume_id`` and ``candidate_id``;
            ``name`` is optional but recommended. Items missing either id
            are skipped.
        out_dir: Output directory; created if needed.
        copy_files: Copy each resume's source file into ``files/``.
        merge_text: Write ``merged_resumes.txt`` from ``resumes.clean_text``.
        merge_pdf: Merge the PDFs in ``files/`` into ``pdf/merged.pdf``.
            Only attempted when ``copy_files`` is true and the optional
            merge helper could be imported.

    Creates in ``out_dir``:
        - ``files/``: copied source files
        - ``merged_resumes.txt`` (if ``merge_text``)
        - ``pdf/merged.pdf`` (if the PDF merge ran and produced a file)
        - ``manifest.json``: per-item copy results and summary
        - ``README.txt``

    Returns:
        A summary dict with ``out_dir``, ``copied_files``, ``missing_files``,
        ``merged_text``, ``merged_pdf``, ``manifest`` and ``pdf_info``.
        ``missing_files`` counts both sources that were not found and copies
        that failed. ``merged_pdf`` is ``None`` unless the merged file
        actually exists.
    """
    out_root = Path(out_dir).resolve()
    files_dir = out_root / "files"
    _safe_mkdir(files_dir)

    manifest: List[Dict[str, Any]] = []
    copied = 0
    missing = 0

    merged_parts: List[str] = []
    merged_txt_path = out_root / "merged_resumes.txt"

    for item in results:
        resume_id = item.get("resume_id")
        cand_id = item.get("candidate_id")
        name = item.get("name") or ""
        if not resume_id or not cand_id:
            continue

        # Text section for the merged TXT, taken from the database.
        if merge_text:
            row = con.execute(
                "SELECT clean_text FROM resumes WHERE resume_id=?",
                (resume_id,),
            ).fetchone()
            clean_text = (row["clean_text"] if row else "") or ""
            header = f"===== {name or cand_id} | {cand_id} | {resume_id} ====="
            merged_parts.append(header)
            merged_parts.append(clean_text.strip())
            merged_parts.append("")

        if not copy_files:
            continue

        src_paths = _pick_source_paths(con, resume_id)
        src_found = _first_existing_file(src_paths)

        if not src_found:
            missing += 1
            manifest.append(
                {
                    "candidate_id": cand_id,
                    "name": name,
                    "resume_id": resume_id,
                    "copied": False,
                    "reason": "source_file_not_found",
                    "tried_paths": src_paths,
                }
            )
            continue

        # Prefer the candidate's name; fall back to the candidate id when the
        # name yields no usable slug. default="" is what makes the fallback
        # reachable, since _slug otherwise never returns an empty string.
        label = _slug(str(name), default="") or _slug(str(cand_id))
        base = f"{label}__{resume_id}"
        try:
            dst = _copy_unique(src_found, files_dir, base)
        except Exception as e:
            # Best effort: one failed copy is recorded, not fatal to the bundle.
            missing += 1
            manifest.append(
                {
                    "candidate_id": cand_id,
                    "name": name,
                    "resume_id": resume_id,
                    "copied": False,
                    "reason": f"copy_failed: {repr(e)}",
                    "source_path": str(src_found),
                }
            )
        else:
            copied += 1
            manifest.append(
                {
                    "candidate_id": cand_id,
                    "name": name,
                    "resume_id": resume_id,
                    "copied": True,
                    "source_path": str(src_found),
                    "dest_path": str(dst),
                }
            )

    # Merged TXT.
    if merge_text:
        merged_txt_path.write_text(
            "\n".join(merged_parts), encoding="utf-8", errors="ignore"
        )

    # Merged PDF built from files/*.pdf.
    merged_pdf_path: Optional[Path] = None
    pdf_info: Optional[Dict[str, Any]] = None
    if merge_pdf and copy_files and merge_all_pdfs_in_dir is not None:
        pdf_target = out_root / "pdf" / "merged.pdf"
        try:
            _safe_mkdir(pdf_target.parent)
            pdf_info = merge_all_pdfs_in_dir(files_dir, pdf_target)
        except Exception as e:
            pdf_info = {"error": f"merge_pdf_failed: {repr(e)}"}
        else:
            # Report the path only if the helper really produced the file
            # (it may not, e.g. when files/ contains no PDFs).
            if pdf_target.is_file():
                merged_pdf_path = pdf_target

    merged_text_str = str(merged_txt_path) if merge_text else None
    merged_pdf_str = str(merged_pdf_path) if merged_pdf_path else None

    # manifest.json
    manifest_path = out_root / "manifest.json"
    manifest_path.write_text(
        json.dumps(
            {
                "out_dir": str(out_root),
                "copied_files": copied,
                "missing_files": missing,
                "merged_text": merged_text_str,
                "merged_pdf": merged_pdf_str,
                "pdf_info": pdf_info,
                "items": manifest,
            },
            ensure_ascii=False,
            indent=2,
        ),
        encoding="utf-8",
        errors="ignore",
    )

    # README
    readme_lines = [
        "Папка создана командой search.",
        "- files/: скопированные исходные файлы резюме",
    ]
    if merge_text:
        readme_lines.append("- merged_resumes.txt: склейка текста clean_text из БД")
    readme_lines.append("- manifest.json: что откуда скопировалось / что не найдено")
    if merge_pdf:
        if merge_all_pdfs_in_dir is None:
            readme_lines.append("- pdf/merged.pdf: НЕ создан (нужен пакет pypdf и модуль pdf_merge.py)")
        else:
            readme_lines.append("- pdf/merged.pdf: склейка всех PDF из files/ (если PDF были)")

    (out_root / "README.txt").write_text(
        "\n".join(readme_lines) + "\n",
        encoding="utf-8",
        errors="ignore",
    )

    return {
        "out_dir": str(out_root),
        "copied_files": copied,
        "missing_files": missing,
        "merged_text": merged_text_str,
        "merged_pdf": merged_pdf_str,
        "manifest": str(manifest_path),
        "pdf_info": pdf_info,
    }