Initial commit

This commit is contained in:
2026-03-11 15:27:10 +03:00
commit 8b4b8d54d1
34 changed files with 7407 additions and 0 deletions

99
extract/text_extract.py Normal file
View File

@@ -0,0 +1,99 @@
from __future__ import annotations
import os
from pathlib import Path
import logging
from bs4 import BeautifulSoup
try: # optional dependency for PDF fallback
from pypdf import PdfReader as _PdfReader # type: ignore
except Exception: # pragma: no cover - optional import
try:
from PyPDF2 import PdfReader as _PdfReader # type: ignore
except Exception: # pragma: no cover
_PdfReader = None # type: ignore
def _read_bytes(path: Path) -> bytes:
return path.read_bytes()
def extract_text_from_txt(path: Path) -> str:
    """
    Decode a text file, trying a list of common encodings in order.

    Bug fix: the original passed errors="ignore" inside the loop, which
    makes decoding *never* raise — the very first attempt ("utf-8") always
    succeeded, so the utf-16/cp1251/latin-1 fallbacks were dead code and
    non-UTF-8 files were silently mangled.  Each candidate is now decoded
    strictly; the first clean decode wins.  If nothing decodes cleanly we
    fall back to lossy UTF-8 (latin-1 accepts any byte string, so this
    final fallback is a belt-and-braces safety net).
    """
    data = path.read_bytes()
    for enc in ("utf-8", "utf-16", "cp1251", "latin-1"):
        try:
            # strict decode: raises UnicodeDecodeError on a mismatch so
            # the next candidate encoding actually gets a chance
            return data.decode(enc)
        except (UnicodeDecodeError, ValueError):
            continue
    return data.decode("utf-8", errors="ignore")
def extract_text_from_html(path: Path) -> str:
    """Strip markup from an HTML file and return its visible text,
    one text node per line."""
    markup = extract_text_from_txt(path)
    parsed = BeautifulSoup(markup, "lxml")
    return parsed.get_text("\n", strip=True)
def extract_text_from_docx(path: Path) -> str:
    """Collect paragraph and table text from a .docx file.

    Table rows are flattened to "cell | cell | cell" lines so the result
    stays plain text.  Empty paragraphs and empty cells are skipped.
    """
    from docx import Document  # local import: python-docx is an optional dependency

    document = Document(str(path))
    chunks = []
    for paragraph in document.paragraphs:
        stripped = (paragraph.text or "").strip()
        if stripped:
            chunks.append(stripped)
    for tbl in document.tables:
        for row in tbl.rows:
            row_cells = [
                cell.text.strip()
                for cell in row.cells
                if cell.text and cell.text.strip()
            ]
            if row_cells:
                chunks.append(" | ".join(row_cells))
    return "\n".join(chunks)
# Hard cap on how many PDF pages are read per file; overridable via the
# PDF_PAGE_LIMIT environment variable (a value of 0 disables the cap —
# see the falsy check in extract_text_from_pdf).
_PDF_PAGE_LIMIT: int = int(os.environ.get("PDF_PAGE_LIMIT", "40"))
# Silence noisy pypdf warnings like "Ignoring wrong pointing object ..."
logging.getLogger("pypdf").setLevel(logging.ERROR)
logging.getLogger("PyPDF2").setLevel(logging.ERROR)
def extract_text_from_pdf(path: Path) -> str:
    """
    Extract plain text from a PDF via the optional pypdf/PyPDF2 backend.

    At most ``_PDF_PAGE_LIMIT`` pages are read (a falsy limit means no cap)
    so a pathological file cannot stall extraction.  Pages whose text
    extraction fails are skipped silently.

    Raises:
        RuntimeError: if no PDF backend is installed, or the file cannot
            be opened/parsed.
    """
    if _PdfReader is None:
        raise RuntimeError("PDF reader dependency missing (install pypdf or PyPDF2)")
    try:
        reader = _PdfReader(str(path), strict=False)
    except Exception as exc:  # pragma: no cover - pdf parser edge cases
        raise RuntimeError(f"PDF read failed: {exc}") from exc
    collected = []
    for page_no, page in enumerate(getattr(reader, "pages", [])):
        if _PDF_PAGE_LIMIT and page_no >= _PDF_PAGE_LIMIT:
            break
        try:
            page_text = page.extract_text()  # type: ignore[attr-defined]
        except Exception:
            page_text = None
        if page_text:
            collected.append(page_text)
    return "\n".join(collected)
def extract_text_from_doc_best_effort(path: Path) -> str:
    """Best-effort text extraction for legacy .doc files.

    Legacy .doc requires external tooling; when the optional ``textract``
    package is missing or its conversion fails, an empty string is
    returned instead of raising.
    """
    try:
        import textract  # type: ignore
        raw = textract.process(str(path))
    except Exception:
        return ""
    return raw.decode("utf-8", errors="ignore")
def extract_text(path: Path) -> str:
    """Dispatch to the extractor matching *path*'s file extension.

    Unknown or missing extensions yield an empty string rather than an
    error, so callers can probe arbitrary files safely.
    """
    suffix = path.suffix.lower()
    if suffix == ".pdf":
        return extract_text_from_pdf(path)
    if suffix == ".docx":
        return extract_text_from_docx(path)
    if suffix == ".doc":
        return extract_text_from_doc_best_effort(path)
    if suffix in {".html", ".htm"}:
        return extract_text_from_html(path)
    if suffix in {".txt", ".log"}:
        return extract_text_from_txt(path)
    return ""