Initial commit

This commit is contained in:
2026-03-11 15:27:10 +03:00
commit 8b4b8d54d1
34 changed files with 7407 additions and 0 deletions

View File

@@ -0,0 +1,73 @@
from __future__ import annotations
import json
from pathlib import Path
from typing import Dict, Iterator, List, Optional
# File suffixes (lowercase, including the leading dot) that count as resume
# attachments. iter_artifacts compares them against ``Path.suffix.lower()``,
# so every entry here must stay lowercase.
RESUME_EXTS = {".pdf", ".docx", ".doc", ".txt", ".html", ".htm"}
def find_result_json(root: Path) -> List[Path]:
    """Return every regular file named ``result.json`` below *root*.

    The search is recursive. Directories that happen to be called
    ``result.json`` are skipped, and the result is sorted so that repeated
    runs (and runs on different filesystems) process exports in the same
    order.

    Args:
        root: Directory to search.

    Returns:
        Sorted list of matching file paths; empty if nothing matches.
    """
    return sorted(p for p in root.rglob("result.json") if p.is_file())
def _text_field_to_str(text_field) -> str:
    """Flatten a message ``text`` value into a single plain string.

    A ``str`` is returned unchanged. For a ``list``, string items are kept
    as they are, dict items that have a ``"text"`` key contribute
    ``str(item["text"])``, every other item is dropped, and the pieces are
    concatenated without a separator. Any other input type yields ``""``.
    """
    if isinstance(text_field, str):
        return text_field
    if not isinstance(text_field, list):
        return ""

    def fragments():
        # Yield the textual part of each list entry, skipping unknown shapes.
        for entry in text_field:
            if isinstance(entry, str):
                yield entry
            elif isinstance(entry, dict) and "text" in entry:
                yield str(entry["text"])

    return "".join(fragments())
# Minimum stripped length for a file-less message to be treated as a resume
# pasted directly into the chat (heuristic).
_MIN_PASTED_RESUME_CHARS = 500


def _extract_chats(data) -> List[Dict]:
    """Return the chat dicts contained in a parsed export document."""
    if not isinstance(data, dict):
        return []
    chats = data.get("chats")
    if isinstance(chats, dict):
        # Layout {"chats": {"list": [...]}}.
        chats = chats.get("list")
    if not isinstance(chats, list):
        chats = []
    if not chats and isinstance(data.get("messages"), list):
        # No chat collection but a top-level "messages" list: the document
        # root is itself one chat (single-chat export layout).
        chats = [data]
    # Drop anything that cannot be queried with .get() further down.
    return [chat for chat in chats if isinstance(chat, dict)]


def _resolve_resume_file(base_dir: Path, file_rel) -> Optional[Path]:
    """Return the resolved attachment path if it is an existing resume file."""
    if not isinstance(file_rel, str) or not file_rel:
        return None
    try:
        candidate = (base_dir / file_rel).resolve()
        if candidate.is_file() and candidate.suffix.lower() in RESUME_EXTS:
            return candidate
    except (OSError, ValueError):
        # The "file" value may be a placeholder sentence or otherwise not a
        # usable path on this platform; treat it as "no attachment".
        return None
    return None


def iter_artifacts(result_json: Path) -> Iterator[Dict]:
    """Yield resume candidates found in one ``result.json`` export.

    Two kinds of records are produced, both with the same keys:

    * ``origin_type == "telegram_json"`` for a message whose ``file``
      attachment exists on disk (relative to the export directory) and has
      a suffix listed in ``RESUME_EXTS``;
    * ``origin_type == "message_text"`` for a message without such an
      attachment whose stripped text is at least 500 characters long
      (heuristic for a resume pasted as plain text).

    Chats are read from ``data["chats"]["list"]``, from ``data["chats"]``
    when that is already a list, or, when neither holds any chat, from the
    document root if it carries a top-level ``messages`` list. Entries that
    are not dicts are skipped instead of aborting the whole export.

    Args:
        result_json: Path to the export's ``result.json``.

    Yields:
        Dicts with keys ``origin_type``, ``export_path``, ``chat_title``,
        ``message_id``, ``message_date``, ``message_text``, ``file_path``,
        ``original_name`` and ``extra``.

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Undecodable bytes are dropped on purpose so one bad character does not
    # make the whole export unreadable.
    data = json.loads(result_json.read_text(encoding="utf-8", errors="ignore"))
    export_dir = result_json.parent

    for chat in _extract_chats(data):
        chat_title = chat.get("name") or chat.get("title") or "unknown_chat"
        messages = chat.get("messages")
        if not isinstance(messages, list):
            continue

        for msg in messages:
            if not isinstance(msg, dict):
                continue

            msg_date = msg.get("date") or msg.get("date_unixtime") or None
            text = _text_field_to_str(msg.get("text", ""))
            resume_file = _resolve_resume_file(export_dir, msg.get("file"))
            has_file = resume_file is not None

            # Without a usable attachment only long messages are kept.
            if not has_file and len(text.strip()) < _MIN_PASTED_RESUME_CHARS:
                continue

            yield {
                "origin_type": "telegram_json" if has_file else "message_text",
                "export_path": str(export_dir),
                "chat_title": chat_title,
                "message_id": str(msg.get("id") or ""),
                "message_date": str(msg_date) if msg_date is not None else None,
                "message_text": text,
                "file_path": str(resume_file) if has_file else None,
                "original_name": resume_file.name if has_file else None,
                "extra": {"json_path": str(result_json)},
            }