67 lines
2.4 KiB
Python
67 lines
2.4 KiB
Python
from __future__ import annotations
|
|
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Dict, Iterator, List, Optional
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Lower-case file suffixes (leading dot included) that qualify a linked file
# as a resume attachment; compared against ``Path.suffix.lower()``.
RESUME_EXTS = {
    ".pdf",
    ".docx",
    ".doc",
    ".txt",
    ".html",
    ".htm",
}
|
|
|
|
def find_messages_html(root: Path) -> List[Path]:
    """Return every ``messages*.html`` file found anywhere under *root*.

    The search is recursive. Directories whose names happen to match the
    pattern are skipped; only regular files are returned.

    Args:
        root: Directory to search.

    Returns:
        Matching paths in sorted order. ``rglob`` yields entries in a
        filesystem-dependent order, so sorting keeps the result (and anything
        processed from it) reproducible across runs and machines.
    """
    return sorted(p for p in root.rglob("messages*.html") if p.is_file())
|
|
|
|
def _resolve_resume_link(base_dir: Path, href: str) -> Optional[Path]:
    """Map an anchor ``href`` to an existing local resume file, or ``None``.

    The href is interpreted relative to *base_dir*. The raw value is tried
    first; if it contains percent-escapes (e.g. ``My%20CV.pdf``) the decoded
    form is tried as well, since the escaped name will not exist on disk.
    """
    # Function-scope import keeps this block self-contained without touching
    # the module's import section.
    from urllib.parse import unquote

    candidates = [href]
    decoded = unquote(href)
    if decoded != href:
        candidates.append(decoded)

    for candidate in candidates:
        try:
            path = (base_dir / candidate).resolve()
            if path.is_file() and path.suffix.lower() in RESUME_EXTS:
                return path
        except (OSError, ValueError):
            # Ordinary web links in a message (over-long URLs, odd characters)
            # are not valid local paths; one bad link must not abort the
            # whole export scan.
            continue
    return None


def iter_artifacts(messages_html: Path, *, min_text_len: int = 500) -> Iterator[Dict]:
    """Yield one artifact dict per relevant message in an exported HTML file.

    A message produces an artifact when either

    * one of its links points to an existing local file whose suffix is in
      ``RESUME_EXTS`` (``origin_type == "telegram_html"``; the first such link
      wins), or
    * it has no such link but its text is at least *min_text_len* characters
      long (``origin_type == "message_text"``).

    All other messages are skipped.

    Args:
        messages_html: Path to the exported ``messages*.html`` file. Links are
            resolved relative to its parent directory.
        min_text_len: Minimum length of the message text for a message
            without an attachment to be yielded. Defaults to 500.

    Yields:
        Dicts with the keys ``origin_type``, ``export_path``, ``chat_title``,
        ``message_id``, ``message_date``, ``message_text``, ``file_path``,
        ``original_name`` and ``extra``. ``file_path`` and ``original_name``
        are ``None`` for text-only artifacts.
    """
    # Undecodable bytes are dropped rather than failing the whole file.
    html = messages_html.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "lxml")

    export_dir = messages_html.parent

    # Chat title comes from the page header; fall back to the folder name
    # when the header is missing or empty.
    header = soup.find(class_=re.compile(r"page_header", re.I))
    header_text = header.get_text(" ", strip=True) if header is not None else None
    chat_title = header_text or export_dir.name

    for msg in soup.select(".message.default.clearfix, .message"):
        message_id = msg.get("id") or None

        date_div = msg.select_one(".date")
        msg_date = date_div.get("title") if date_div is not None else None

        text_div = msg.select_one(".text")
        msg_text = text_div.get_text("\n", strip=True) if text_div is not None else ""

        # First link that resolves to a local resume file wins.
        attachment = None
        for anchor in msg.find_all("a", href=True):
            attachment = _resolve_resume_link(export_dir, anchor["href"])
            if attachment is not None:
                break

        if attachment is not None:
            origin_type = "telegram_html"
        elif msg_text and len(msg_text.strip()) >= min_text_len:
            origin_type = "message_text"
        else:
            # Neither an attachment nor enough text: nothing to emit.
            continue

        yield {
            "origin_type": origin_type,
            "export_path": str(export_dir),
            "chat_title": chat_title,
            "message_id": str(message_id) if message_id else None,
            "message_date": msg_date,
            "message_text": msg_text,
            "file_path": str(attachment) if attachment is not None else None,
            "original_name": attachment.name if attachment is not None else None,
            "extra": {"html_path": str(messages_html)},
        }
|