Initial commit
This commit is contained in:
21
importers/file_scan.py
Normal file
21
importers/file_scan.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterator
|
||||
|
||||
RESUME_EXTS = {".pdf", ".docx", ".doc", ".txt", ".html", ".htm"}


def iter_files(root: Path) -> Iterator[Dict]:
    """Recursively walk *root* and yield one artifact record per resume-like file.

    A file qualifies when its extension (compared case-insensitively) is in
    RESUME_EXTS. The record layout matches the Telegram importers; all
    message-related fields are blank because a bare file scan has no chat
    context.
    """
    export_path = str(root)  # invariant across the walk; compute once
    for entry in root.rglob("*"):
        if not entry.is_file():
            continue
        if entry.suffix.lower() not in RESUME_EXTS:
            continue
        yield {
            "origin_type": "file_scan",
            "export_path": export_path,
            "chat_title": None,
            "message_id": None,
            "message_date": None,
            "message_text": "",
            "file_path": str(entry.resolve()),
            "original_name": entry.name,
            "extra": {},
        }
|
||||
66
importers/telegram_html.py
Normal file
66
importers/telegram_html.py
Normal file
@@ -0,0 +1,66 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterator, List, Optional
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
RESUME_EXTS = {".pdf", ".docx", ".doc", ".txt", ".html", ".htm"}


def find_messages_html(root: Path) -> List[Path]:
    """Collect every exported Telegram chat page under *root*.

    Telegram Desktop HTML exports name chat pages ``messages.html``,
    ``messages2.html``, ... — anything matching that glob is returned,
    skipping directories that happen to match.
    """
    return list(filter(Path.is_file, root.rglob("messages*.html")))
|
||||
|
||||
def iter_artifacts(messages_html: Path) -> Iterator[Dict]:
    """Yield artifact records parsed from one exported Telegram chat page.

    Two record shapes are produced:

    * ``origin_type="telegram_html"`` — the message links to an attachment on
      disk whose extension is in RESUME_EXTS;
    * ``origin_type="message_text"`` — no attachment, but the message body is
      at least 500 characters (heuristic for a resume pasted as plain text).
    """
    markup = messages_html.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(markup, "lxml")
    export_dir = messages_html.parent

    # Chat title: the export's page header, falling back to the folder name.
    header = soup.find(class_=re.compile(r"page_header", re.I))
    title = header.get_text(" ", strip=True) if header else None
    chat_title = title or export_dir.name

    for node in soup.select(".message.default.clearfix, .message"):
        msg_id = node.get("id") or None

        date_node = node.select_one(".date")
        msg_date = date_node.get("title") if date_node else None

        body_node = node.select_one(".text")
        body_text = body_node.get_text("\n", strip=True) if body_node else ""

        # First linked attachment (href is relative to the export directory)
        # with a resume-like extension wins.
        attachment = None
        for link in node.find_all("a", href=True):
            target = (export_dir / link["href"]).resolve()
            if target.exists() and target.suffix.lower() in RESUME_EXTS:
                attachment = target
                break

        if attachment is not None:
            yield {
                "origin_type": "telegram_html",
                "export_path": str(export_dir),
                "chat_title": chat_title,
                "message_id": str(msg_id) if msg_id else None,
                "message_date": msg_date,
                "message_text": body_text or "",
                "file_path": str(attachment),
                "original_name": attachment.name,
                "extra": {"html_path": str(messages_html)},
            }
        elif body_text and len(body_text.strip()) >= 500:
            yield {
                "origin_type": "message_text",
                "export_path": str(export_dir),
                "chat_title": chat_title,
                "message_id": str(msg_id) if msg_id else None,
                "message_date": msg_date,
                "message_text": body_text,
                "file_path": None,
                "original_name": None,
                "extra": {"html_path": str(messages_html)},
            }
|
||||
73
importers/telegram_json.py
Normal file
73
importers/telegram_json.py
Normal file
@@ -0,0 +1,73 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterator, List, Optional
|
||||
|
||||
RESUME_EXTS = {".pdf", ".docx", ".doc", ".txt", ".html", ".htm"}


def find_result_json(root: Path) -> List[Path]:
    """Return every ``result.json`` export manifest found under *root*.

    Recurses the whole tree and keeps only real files, mirroring
    ``find_messages_html`` in the HTML importer — without the ``is_file``
    filter a directory that happened to be named ``result.json`` would be
    returned and later crash ``iter_artifacts`` on ``read_text``.
    """
    return [p for p in root.rglob("result.json") if p.is_file()]
|
||||
|
||||
def _text_field_to_str(text_field) -> str:
|
||||
if isinstance(text_field, str):
|
||||
return text_field
|
||||
if isinstance(text_field, list):
|
||||
parts = []
|
||||
for item in text_field:
|
||||
if isinstance(item, str):
|
||||
parts.append(item)
|
||||
elif isinstance(item, dict) and "text" in item:
|
||||
parts.append(str(item["text"]))
|
||||
return "".join(parts)
|
||||
return ""
|
||||
|
||||
def iter_artifacts(result_json: Path) -> Iterator[Dict]:
    """Yield artifact records parsed from a Telegram Desktop JSON export.

    Two record shapes are produced:

    * ``origin_type="telegram_json"`` — the message references a file (relative
      to the export directory) that exists on disk with an extension in
      RESUME_EXTS;
    * ``origin_type="message_text"`` — no usable attachment, but the message
      text is at least 500 characters (heuristic for a pasted resume).

    Raises ``json.JSONDecodeError`` if *result_json* is not valid JSON.
    """
    data = json.loads(result_json.read_text(encoding="utf-8", errors="ignore"))

    chats: List[Dict] = []
    if isinstance(data, dict):
        raw_chats = data.get("chats") or []
        if isinstance(raw_chats, dict):
            # Full-account export: {"chats": {"list": [...]}}.
            chats = raw_chats.get("list") or []
        elif isinstance(raw_chats, list):
            # Chat list stored directly under "chats". The previous
            # `.get("chats", {}).get("list", [])` chain raised AttributeError
            # on this shape, and its `or data.get("chats", [])` fallback could
            # hand back the dict itself, iterating keys instead of chats.
            chats = raw_chats
        if not chats and isinstance(data.get("messages"), list):
            # Single-chat export: the chat object *is* the top-level dict
            # (it carries "name"/"messages" directly).
            chats = [data]

    for chat in chats:
        if not isinstance(chat, dict):
            continue  # defensive: skip malformed chat entries
        chat_title = chat.get("name") or chat.get("title") or "unknown_chat"
        for msg in chat.get("messages") or []:
            if not isinstance(msg, dict):
                continue  # defensive: skip malformed message entries
            msg_id = str(msg.get("id") or "")
            msg_date = msg.get("date") or msg.get("date_unixtime") or None
            text = _text_field_to_str(msg.get("text", ""))

            # Resolve the attachment path (relative to the export directory)
            # and keep it only if it exists with a resume-like extension.
            file_path = None
            original_name = None
            file_rel = msg.get("file") or None
            if file_rel:
                candidate = (result_json.parent / file_rel).resolve()
                if candidate.exists() and candidate.suffix.lower() in RESUME_EXTS:
                    file_path = str(candidate)
                    original_name = candidate.name

            if file_path:
                yield {
                    "origin_type": "telegram_json",
                    "export_path": str(result_json.parent),
                    "chat_title": chat_title,
                    "message_id": msg_id,
                    "message_date": str(msg_date) if msg_date is not None else None,
                    "message_text": text or "",
                    "file_path": file_path,
                    "original_name": original_name,
                    "extra": {"json_path": str(result_json)},
                }
            elif text and len(text.strip()) >= 500:
                # Message-only resume paste (heuristic: long free text, no file).
                yield {
                    "origin_type": "message_text",
                    "export_path": str(result_json.parent),
                    "chat_title": chat_title,
                    "message_id": msg_id,
                    "message_date": str(msg_date) if msg_date is not None else None,
                    "message_text": text,
                    "file_path": None,
                    "original_name": None,
                    "extra": {"json_path": str(result_json)},
                }
|
||||
Reference in New Issue
Block a user