Initial commit
bundle_export.py (new file, 267 lines)
@@ -0,0 +1,267 @@
from __future__ import annotations

import json
import os
import re
import shutil
import sqlite3
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional

# NEW: PDF merge helper (pypdf)
# pip install pypdf
try:
    from tg_resume_db.pdf_merge import merge_all_pdfs_in_dir
except Exception:  # keep the bundle working even if pypdf / the module is not installed
    merge_all_pdfs_in_dir = None
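
# For reference, a minimal sketch of what tg_resume_db.pdf_merge could look
# like, built on pypdf's PdfWriter (the function body and return shape here
# are an assumption for illustration, not the actual module):
#
#     from pypdf import PdfWriter
#
#     def merge_all_pdfs_in_dir(src_dir: Path, out_path: Path) -> Dict[str, Any]:
#         writer = PdfWriter()
#         pdfs = sorted(src_dir.glob("*.pdf"))
#         for pdf in pdfs:
#             writer.append(str(pdf))  # append every page of this PDF
#         if pdfs:
#             with out_path.open("wb") as f:
#                 writer.write(f)
#         return {"merged_count": len(pdfs), "output": str(out_path)}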


def _slug(s: str, max_len: int = 60) -> str:
    s = (s or "").strip()
    if not s:
        return "candidate"
    s = re.sub(r"\s+", " ", s)
    s = re.sub(r"[^0-9A-Za-zА-Яа-я_\- ]+", "_", s)
    s = s.replace(" ", "_")
    s = re.sub(r"_+", "_", s).strip("_")
    if not s:
        return "candidate"
    return s[:max_len]
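
# Examples of the rules above: disallowed runs and spaces become "_",
# repeats collapse, and an empty result falls back to "candidate":
#     _slug("Ivan  Ivanov (Senior)")  -> "Ivan_Ivanov_Senior"
#     _slug("???")                    -> "candidate"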


def _safe_mkdir(p: Path) -> None:
    p.mkdir(parents=True, exist_ok=True)


def _pick_source_paths(con: sqlite3.Connection, resume_id: str) -> List[str]:
    """
    Return candidate paths to the resume file, highest priority first:
    1) resumes.file_path
    2) sources.original_file_path
    3) a few path variants from sources.extra_json
    """
    paths: List[str] = []

    row = con.execute(
        "SELECT file_path FROM resumes WHERE resume_id=?",
        (resume_id,),
    ).fetchone()
    if row and row["file_path"]:
        paths.append(str(row["file_path"]))

    cur = con.execute(
        """SELECT original_file_path, original_file_name, extra_json
           FROM sources
           WHERE resume_id=?""",
        (resume_id,),
    )
    for r in cur.fetchall():
        ofp = r["original_file_path"]
        if ofp:
            paths.append(str(ofp))

        try:
            extra = json.loads(r["extra_json"] or "{}")
            if isinstance(extra, dict):
                for k in ("file_path", "path", "local_path", "source_path"):
                    if extra.get(k):
                        paths.append(str(extra[k]))
        except Exception:
            pass

    # dedupe, preserving priority order
    seen = set()
    out: List[str] = []
    for p in paths:
        p2 = os.path.normpath(p)
        if p2 in seen:
            continue
        seen.add(p2)
        out.append(p2)
    return out
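
# Example (hypothetical values): if resumes.file_path and a sources row both
# point at files, the result keeps priority order and drops duplicates:
#     ["/data/resumes/ivan.pdf", "/data/inbox/ivan.pdf"]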


def _copy_unique(src: Path, dst_dir: Path, base_name: str) -> Path:
    """Copy src into dst_dir as base_name, appending (2), (3), ... on collision."""
    ext = src.suffix.lower() if src.suffix else ""
    candidate = f"{base_name}{ext}"
    dst = dst_dir / candidate

    if dst.exists():
        i = 2
        while True:
            dst = dst_dir / f"{base_name}({i}){ext}"
            if not dst.exists():
                break
            i += 1

    shutil.copy2(src, dst)
    return dst
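
# Example: with base_name "Ivan_Ivanov__r-001" and an existing
# files/Ivan_Ivanov__r-001.pdf, the copy lands at
# files/Ivan_Ivanov__r-001(2).pdf, then (3), and so on.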


def bundle_search_results(
    con: sqlite3.Connection,
    results: Iterable[Dict[str, Any]],
    out_dir: str,
    *,
    copy_files: bool = True,
    merge_text: bool = True,
    merge_pdf: bool = True,  # NEW
) -> Dict[str, Any]:
    """
    results: an iterable of dicts containing at least:
      - resume_id
      - candidate_id
      - name (preferably)

    NOTE: `con` must use sqlite3.Row as its row_factory, since rows are
    read by column name below.

    Creates:
      - files/: copied source resume files
      - merged_resumes.txt: clean_text from the DB, concatenated (if merge_text)
      - pdf/merged.pdf: all PDFs from files/ merged into one (if merge_pdf and pypdf is installed)
      - manifest.json
      - README.txt
    """
    out_root = Path(out_dir).resolve()
    files_dir = out_root / "files"
    _safe_mkdir(files_dir)

    manifest: List[Dict[str, Any]] = []
    copied = 0
    missing = 0

    merged_parts: List[str] = []
    merged_txt_path = out_root / "merged_resumes.txt"

    for item in results:
        resume_id = item.get("resume_id")
        cand_id = item.get("candidate_id")
        name = item.get("name") or ""
        if not resume_id or not cand_id:
            continue

        # merged TXT from the DB
        if merge_text:
            row = con.execute(
                "SELECT clean_text FROM resumes WHERE resume_id=?",
                (resume_id,),
            ).fetchone()
            clean_text = (row["clean_text"] if row else "") or ""
            header = f"===== {name or cand_id} | {cand_id} | {resume_id} ====="
            merged_parts.append(header)
            merged_parts.append(clean_text.strip())
            merged_parts.append("")

        if not copy_files:
            continue

        src_paths = _pick_source_paths(con, resume_id)

        src_found: Optional[Path] = None
        for sp in src_paths:
            p = Path(sp)
            if p.exists() and p.is_file():
                src_found = p
                break

        if not src_found:
            missing += 1
            manifest.append(
                {
                    "candidate_id": cand_id,
                    "name": name,
                    "resume_id": resume_id,
                    "copied": False,
                    "reason": "source_file_not_found",
                    "tried_paths": src_paths,
                }
            )
            continue

        base = f"{_slug(name) or _slug(cand_id)}__{resume_id}"
        try:
            dst = _copy_unique(src_found, files_dir, base)
            copied += 1
            manifest.append(
                {
                    "candidate_id": cand_id,
                    "name": name,
                    "resume_id": resume_id,
                    "copied": True,
                    "source_path": str(src_found),
                    "dest_path": str(dst),
                }
            )
        except Exception as e:
            missing += 1
            manifest.append(
                {
                    "candidate_id": cand_id,
                    "name": name,
                    "resume_id": resume_id,
                    "copied": False,
                    "reason": f"copy_failed: {repr(e)}",
                    "source_path": str(src_found),
                }
            )

    # merged TXT
    if merge_text:
        merged_txt_path.write_text("\n".join(merged_parts), encoding="utf-8", errors="ignore")

    # NEW: merged PDF from files/*.pdf
    merged_pdf_path: Optional[Path] = None
    pdf_info: Optional[Dict[str, Any]] = None
    if merge_pdf and copy_files and merge_all_pdfs_in_dir is not None:
        try:
            merged_pdf_path = out_root / "pdf" / "merged.pdf"
            _safe_mkdir(merged_pdf_path.parent)
            pdf_info = merge_all_pdfs_in_dir(files_dir, merged_pdf_path)
        except Exception as e:
            pdf_info = {"error": f"merge_pdf_failed: {repr(e)}"}

    # manifest.json
    (out_root / "manifest.json").write_text(
        json.dumps(
            {
                "out_dir": str(out_root),
                "copied_files": copied,
                "missing_files": missing,
                "merged_text": str(merged_txt_path) if merge_text else None,
                "merged_pdf": str(merged_pdf_path) if merged_pdf_path else None,
                "pdf_info": pdf_info,
                "items": manifest,
            },
            ensure_ascii=False,
            indent=2,
        ),
        encoding="utf-8",
        errors="ignore",
    )

    # README
    readme_lines = [
        "This folder was created by the search command.",
        "- files/: copied source resume files",
        "- merged_resumes.txt: clean_text from the DB, concatenated",
        "- manifest.json: what was copied from where / what was not found",
    ]
    if merge_pdf:
        if merge_all_pdfs_in_dir is None:
            readme_lines.append("- pdf/merged.pdf: NOT created (requires the pypdf package and the pdf_merge.py module)")
        else:
            readme_lines.append("- pdf/merged.pdf: all PDFs from files/ merged into one (if any PDFs were present)")

    (out_root / "README.txt").write_text(
        "\n".join(readme_lines) + "\n",
        encoding="utf-8",
        errors="ignore",
    )

    return {
        "out_dir": str(out_root),
        "copied_files": copied,
        "missing_files": missing,
        "merged_text": str(merged_txt_path) if merge_text else None,
        "merged_pdf": str(merged_pdf_path) if merged_pdf_path else None,
        "manifest": str(out_root / "manifest.json"),
        "pdf_info": pdf_info,
    }
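

# A minimal usage sketch. The DB path and results list are made up for
# illustration; in the real project, results come from the search command.
if __name__ == "__main__":
    con = sqlite3.connect("resumes.db")  # hypothetical DB path
    con.row_factory = sqlite3.Row  # required: rows are read by column name
    demo_results = [
        {"resume_id": "r-001", "candidate_id": "c-001", "name": "Ivan Ivanov"},
    ]
    info = bundle_search_results(con, demo_results, "out_bundle")
    print(json.dumps(info, ensure_ascii=False, indent=2))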