# Helpers for locating Telegram-style ``result.json`` exports and extracting
# resume-like artifacts (attached files or long pasted messages) from them.
from __future__ import annotations

import json
from pathlib import Path
from typing import Dict, Iterator, List, Optional

# Lower-case file suffixes (with leading dot) accepted as resume attachments.
RESUME_EXTS = {".pdf", ".docx", ".doc", ".txt", ".html", ".htm"}

# Minimum stripped length for a file-less message to count as a pasted resume.
MIN_RESUME_TEXT_CHARS = 500


def find_result_json(root: Path) -> List[Path]:
    """Return every path named ``result.json`` found recursively under *root*."""
    return list(root.rglob("result.json"))


def _text_field_to_str(text_field) -> str:
    """Flatten a message ``text`` field into a plain string.

    A string is returned unchanged.  A list is concatenated from its string
    items and from the ``"text"`` value of its dict items; any other item is
    dropped.  Every other input type yields ``""``.
    """
    if isinstance(text_field, str):
        return text_field
    if not isinstance(text_field, list):
        return ""
    parts = []
    for item in text_field:
        if isinstance(item, str):
            parts.append(item)
        elif isinstance(item, dict) and "text" in item:
            parts.append(str(item["text"]))
    return "".join(parts)


def _extract_chats(data) -> List[Dict]:
    """Return the chat dicts contained in parsed export *data*.

    Accepted layouts: ``{"chats": {"list": [...]}}``, ``{"chats": [...]}``,
    and a top-level object without a ``"chats"`` key that itself carries a
    ``"messages"`` list (treated as one chat).  Anything else, and any entry
    that is not a dict, produces no chats.
    """
    if not isinstance(data, dict):
        return []
    if "chats" in data:
        chats = data["chats"]
        if isinstance(chats, dict):
            # Only the "list" member holds chats; sibling keys are metadata
            # and must never be iterated as if they were chats.
            chats = chats.get("list")
    elif isinstance(data.get("messages"), list):
        chats = [data]
    else:
        chats = None
    if not isinstance(chats, list):
        return []
    return [chat for chat in chats if isinstance(chat, dict)]


def _resolve_attachment(export_dir: Path, file_rel) -> Optional[Path]:
    """Return the resolved attachment path, or ``None`` if it is not usable.

    The attachment is usable when *file_rel* is a non-empty string that,
    joined onto *export_dir*, resolves to an existing regular file whose
    suffix (case-insensitive) is in ``RESUME_EXTS``.
    """
    if not isinstance(file_rel, str) or not file_rel:
        return None
    try:
        candidate = (export_dir / file_rel).resolve()
        if not candidate.is_file():
            return None
    except (OSError, ValueError, RuntimeError):
        # Malformed names (embedded NUL, over-long components, symlink loops)
        # mean "no usable attachment", not a failure of the whole export.
        return None
    if candidate.suffix.lower() not in RESUME_EXTS:
        return None
    return candidate


def iter_artifacts(
    result_json: Path,
    *,
    min_text_chars: int = MIN_RESUME_TEXT_CHARS,
) -> Iterator[Dict]:
    """Yield one record per resume-like message found in *result_json*.

    A message produces a record when either

    * its ``"file"`` entry points at an existing file with a suffix in
      ``RESUME_EXTS`` (``origin_type == "telegram_json"``), or
    * it has no such file but its stripped text is non-empty and at least
      *min_text_chars* characters long (``origin_type == "message_text"``;
      a heuristic for resumes pasted directly into the chat).

    Args:
        result_json: Path of the export JSON file; attachment paths are
            interpreted relative to its parent directory.
        min_text_chars: Length threshold for the pasted-text heuristic.

    Yields:
        Dicts with the keys ``origin_type``, ``export_path``, ``chat_title``,
        ``message_id``, ``message_date``, ``message_text``, ``file_path``,
        ``original_name`` and ``extra``.

    Raises:
        OSError: If *result_json* cannot be read.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Undecodable bytes are dropped rather than aborting the whole export.
    raw = result_json.read_text(encoding="utf-8", errors="ignore")
    data = json.loads(raw)
    export_dir = result_json.parent

    for chat in _extract_chats(data):
        chat_title = chat.get("name") or chat.get("title") or "unknown_chat"
        messages = chat.get("messages")
        if not isinstance(messages, list):
            continue
        for msg in messages:
            if not isinstance(msg, dict):
                continue
            text = _text_field_to_str(msg.get("text", ""))
            attachment = _resolve_attachment(export_dir, msg.get("file"))
            if attachment is None:
                # Message-only resume paste (heuristic): require enough text.
                stripped_len = len(text.strip())
                if stripped_len == 0 or stripped_len < min_text_chars:
                    continue
            msg_date = msg.get("date") or msg.get("date_unixtime") or None
            has_file = attachment is not None
            yield {
                "origin_type": "telegram_json" if has_file else "message_text",
                "export_path": str(export_dir),
                "chat_title": chat_title,
                "message_id": str(msg.get("id") or ""),
                "message_date": str(msg_date) if msg_date is not None else None,
                "message_text": text,
                "file_path": str(attachment) if has_file else None,
                "original_name": attachment.name if has_file else None,
                "extra": {"json_path": str(result_json)},
            }