from __future__ import annotations

import re
from pathlib import Path
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import unquote

from bs4 import BeautifulSoup

# File extensions (lower-case, with leading dot) accepted as attachments.
RESUME_EXTS = {".pdf", ".docx", ".doc", ".txt", ".html", ".htm"}

# Default minimum length (in characters) for a message without an attachment
# to be yielded as a text-only artifact.
MIN_TEXT_LENGTH = 500

# Matches any CSS class containing "page_header" (case-insensitive).
_PAGE_HEADER_RE = re.compile(r"page_header", re.I)

# Matches hrefs that begin with a URL scheme of two or more characters
# ("https:", "mailto:", "tg:", ...). Single-letter prefixes are deliberately
# not matched so Windows drive letters ("C:") are not mistaken for a scheme.
_URL_SCHEME_RE = re.compile(r"^[a-z][a-z0-9+.\-]+:", re.I)


def find_messages_html(root: Path) -> List[Path]:
    """Return every ``messages*.html`` file found recursively under *root*.

    The result is sorted by path so repeated runs process files in the same
    order regardless of filesystem enumeration order. The sort is plain
    lexicographic path ordering (``messages10.html`` precedes
    ``messages2.html``).
    """
    return sorted(p for p in root.rglob("messages*.html") if p.is_file())


def _find_attachment(msg: Any, base_dir: Path) -> Optional[Path]:
    """Return the first linked local file in *msg* with a ``RESUME_EXTS`` suffix.

    *msg* is a parsed HTML element; each ``<a href=...>`` inside it is
    interpreted relative to *base_dir*. Returns ``None`` when no link points
    at an existing regular file with an accepted extension.
    """
    for anchor in msg.find_all("a", href=True):
        href = anchor["href"]
        # External links can never be local files; skipping them avoids
        # pointless (and occasionally failing) filesystem probes.
        if not href or _URL_SCHEME_RE.match(href):
            continue

        # Try the href verbatim first, then its percent-decoded form, since
        # the on-disk name contains the decoded characters.
        candidates = [href]
        decoded = unquote(href)
        if decoded != href:
            candidates.append(decoded)

        for candidate in candidates:
            try:
                path = (base_dir / candidate).resolve()
                if path.is_file() and path.suffix.lower() in RESUME_EXTS:
                    return path
            except (OSError, ValueError, RuntimeError):
                # Over-long names, embedded NULs or symlink loops: treat the
                # link as "not a usable file" instead of aborting the scan.
                continue
    return None


def iter_artifacts(
    messages_html: Path,
    *,
    min_text_length: int = MIN_TEXT_LENGTH,
) -> Iterator[Dict[str, Any]]:
    """Yield one artifact dict per relevant message in an exported HTML page.

    A message produces an artifact when either:

    * it links to an existing local file whose extension is in
      ``RESUME_EXTS`` (``origin_type == "telegram_html"``), or
    * it has no such link but its text is at least *min_text_length*
      characters long (``origin_type == "message_text"``).

    Args:
        messages_html: Path to the ``messages*.html`` file to parse. It is
            read as UTF-8 with undecodable bytes ignored.
        min_text_length: Minimum stripped text length for a message without
            an attachment to be yielded. Defaults to ``MIN_TEXT_LENGTH``.

    Yields:
        Dicts with the keys ``origin_type``, ``export_path``, ``chat_title``,
        ``message_id``, ``message_date``, ``message_text``, ``file_path``,
        ``original_name`` and ``extra``. ``message_date`` is the ``title``
        attribute of the message's ``.date`` element, or ``None``.

    Raises:
        OSError: If *messages_html* cannot be read.
    """
    html = messages_html.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "lxml")
    export_dir = messages_html.parent

    # Prefer the page header text; fall back to the export directory name.
    header = soup.find(class_=_PAGE_HEADER_RE)
    header_text = header.get_text(" ", strip=True) if header else None
    chat_title = header_text or export_dir.name

    # ".message" already covers ".message.default.clearfix".
    for msg in soup.select(".message"):
        text_div = msg.select_one(".text")
        msg_text = text_div.get_text("\n", strip=True) if text_div else ""

        attachment = _find_attachment(msg, export_dir)
        if attachment is None and (
            not msg_text or len(msg_text.strip()) < min_text_length
        ):
            continue

        raw_id = msg.get("id")
        date_div = msg.select_one(".date")

        yield {
            "origin_type": (
                "telegram_html" if attachment is not None else "message_text"
            ),
            "export_path": str(export_dir),
            "chat_title": chat_title,
            "message_id": str(raw_id) if raw_id else None,
            "message_date": date_div.get("title") if date_div else None,
            "message_text": msg_text,
            "file_path": str(attachment) if attachment is not None else None,
            "original_name": attachment.name if attachment is not None else None,
            "extra": {"html_path": str(messages_html)},
        }