74 lines
2.9 KiB
Python
74 lines
2.9 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
from pathlib import Path
|
|
from typing import Dict, Iterator, List, Optional
|
|
|
|
# Lower-case file suffixes (leading dot included) that iter_artifacts accepts
# when deciding whether a message attachment should be reported.
RESUME_EXTS = {".pdf", ".docx", ".doc", ".txt", ".html", ".htm"}
|
|
|
|
def find_result_json(root: Path) -> List[Path]:
    """Locate every ``result.json`` file anywhere below *root*.

    Args:
        root: Directory that is searched recursively.

    Returns:
        Paths of regular files named exactly ``result.json``, sorted so that
        repeated runs over the same tree see the exports in the same order.
        Directories that merely happen to be called ``result.json`` are
        excluded because they cannot be read as an export document.
    """
    # rglob gives no ordering guarantee, so sort for reproducible processing.
    return sorted(path for path in root.rglob("result.json") if path.is_file())
|
|
|
|
def _text_field_to_str(text_field) -> str:
    """Flatten a message ``text`` value into one plain string.

    A string is returned unchanged. A list is concatenated without a
    separator: string elements contribute themselves, dict elements that have
    a ``"text"`` key contribute ``str()`` of that value, and every other
    element is skipped. Any other input type produces ``""``.
    """
    if isinstance(text_field, str):
        return text_field
    if not isinstance(text_field, list):
        return ""

    fragments = (
        entry if isinstance(entry, str) else str(entry["text"])
        for entry in text_field
        if isinstance(entry, str) or (isinstance(entry, dict) and "text" in entry)
    )
    return "".join(fragments)
|
|
|
|
def _chats_from_export(data) -> List[Dict]:
    """Pull the chat objects out of a parsed export document.

    Recognised layouts:

    * ``{"chats": {"list": [...]}}``
    * ``{"chats": [...]}``
    * a dict with a top-level ``"messages"`` list and no ``"chats"`` entry,
      which is treated as one chat (NOTE(review): believed to be the layout
      of a single-chat export; confirm against real exports).

    Any other shape yields an empty list, and entries that are not dicts are
    dropped so callers can use ``.get`` on every returned item.
    """
    if not isinstance(data, dict):
        return []

    container = data.get("chats")
    if isinstance(container, dict):
        candidates = container.get("list")
    elif isinstance(container, list):
        candidates = container
    elif container is None and isinstance(data.get("messages"), list):
        candidates = [data]
    else:
        candidates = None

    if not isinstance(candidates, list):
        return []
    return [chat for chat in candidates if isinstance(chat, dict)]


def _resolve_resume_file(export_dir: Path, file_rel) -> Optional[Path]:
    """Return the resolved attachment path when it is a usable resume file.

    *file_rel* is the raw ``"file"`` value of a message, interpreted relative
    to *export_dir*. ``None`` is returned when the value is empty or not a
    string, when the path cannot be resolved, when its suffix is not listed in
    ``RESUME_EXTS``, or when it does not point at an existing regular file.
    """
    if not file_rel or not isinstance(file_rel, str):
        return None
    try:
        candidate = (export_dir / file_rel).resolve()
    except (OSError, ValueError):
        # Unresolvable path text is handled like a missing attachment.
        return None
    # The suffix is checked after resolve(), i.e. on the final target.
    if candidate.suffix.lower() not in RESUME_EXTS:
        return None
    return candidate if candidate.is_file() else None


def iter_artifacts(result_json: Path) -> Iterator[Dict]:
    """Yield resume candidates found in one ``result.json`` export.

    Each message produces at most one record:

    * ``origin_type == "telegram_json"`` when the message's ``"file"`` points
      at an existing file whose suffix is in ``RESUME_EXTS``;
    * ``origin_type == "message_text"`` when there is no such file but the
      stripped message text is at least 500 characters long (heuristic for a
      resume pasted directly into the chat).

    Chats, message lists and messages that do not have the expected container
    type are skipped instead of aborting the whole export.

    Args:
        result_json: Path of the export document. Attachment paths are
            resolved relative to its parent directory.

    Yields:
        Dicts with the keys ``origin_type``, ``export_path``, ``chat_title``,
        ``message_id``, ``message_date``, ``message_text``, ``file_path``,
        ``original_name`` and ``extra`` (which holds ``json_path``).

    Raises:
        OSError: If *result_json* cannot be read.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    data = json.loads(result_json.read_text(encoding="utf-8", errors="ignore"))
    export_dir = result_json.parent

    for chat in _chats_from_export(data):
        chat_title = chat.get("name") or chat.get("title") or "unknown_chat"
        messages = chat.get("messages")
        if not isinstance(messages, list):
            continue

        for msg in messages:
            if not isinstance(msg, dict):
                continue

            text = _text_field_to_str(msg.get("text", ""))
            attachment = _resolve_resume_file(export_dir, msg.get("file"))

            if attachment is not None:
                origin_type = "telegram_json"
            elif len(text.strip()) >= 500:
                # message-only resume paste (heuristic)
                origin_type = "message_text"
            else:
                continue

            raw_id = msg.get("id")
            msg_date = msg.get("date") or msg.get("date_unixtime") or None

            yield {
                "origin_type": origin_type,
                "export_path": str(export_dir),
                "chat_title": chat_title,
                # Compare against None so a numeric id of 0 is not lost.
                "message_id": "" if raw_id is None else str(raw_id),
                "message_date": str(msg_date) if msg_date is not None else None,
                "message_text": text,
                "file_path": str(attachment) if attachment is not None else None,
                "original_name": attachment.name if attachment is not None else None,
                "extra": {"json_path": str(result_json)},
            }
|