Initial commit
This commit is contained in:
99
extract/text_extract.py
Normal file
99
extract/text_extract.py
Normal file
@@ -0,0 +1,99 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
import logging
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Locate a PdfReader implementation from whichever optional backend is
# installed: prefer the maintained `pypdf`, fall back to the legacy
# `PyPDF2` name, and leave `_PdfReader` as None when neither is present
# (extract_text_from_pdf checks for None and raises RuntimeError).
try:  # optional dependency for PDF fallback
    from pypdf import PdfReader as _PdfReader  # type: ignore
except Exception:  # pragma: no cover - optional import
    try:
        from PyPDF2 import PdfReader as _PdfReader  # type: ignore
    except Exception:  # pragma: no cover
        _PdfReader = None  # type: ignore
|
||||
|
||||
def _read_bytes(path: Path) -> bytes:
    """Return the file's content as raw, undecoded bytes."""
    with path.open("rb") as handle:
        return handle.read()
|
||||
|
||||
def extract_text_from_txt(path: Path) -> str:
    """Decode a text file, trying several encodings in order.

    Each candidate encoding is tried strictly; the first one that decodes
    the whole file wins.  If none fits, fall back to UTF-8 with undecodable
    bytes dropped so the caller always receives *some* text.

    Bug fix: the previous version decoded with errors="ignore" inside the
    loop, which never raises, so the first (utf-8) attempt always
    "succeeded" and the utf-16/cp1251/latin-1 fallbacks were dead code.
    """
    data = path.read_bytes()
    for enc in ("utf-8", "utf-16", "cp1251", "latin-1"):
        try:
            # Strict decode: raises on a mismatch so the next encoding
            # actually gets a chance.
            return data.decode(enc)
        except (UnicodeDecodeError, UnicodeError):
            continue
    # Defensive last resort (latin-1 above accepts any byte, so this is
    # normally unreachable); never raise out of a plain-text read.
    return data.decode("utf-8", errors="ignore")
|
||||
|
||||
def extract_text_from_html(path: Path) -> str:
    """Strip markup from an HTML file and return its visible text.

    The file is decoded via extract_text_from_txt, parsed with the lxml
    backend, and flattened to newline-separated, stripped text nodes.
    """
    markup = extract_text_from_txt(path)
    return BeautifulSoup(markup, "lxml").get_text("\n", strip=True)
|
||||
|
||||
def extract_text_from_docx(path: Path) -> str:
    """Collect text from a .docx file: one line per paragraph, and one
    pipe-joined line per non-empty table row."""
    from docx import Document

    document = Document(str(path))
    lines = []
    for paragraph in document.paragraphs:
        if paragraph.text and paragraph.text.strip():
            lines.append(paragraph.text.strip())
    for table in document.tables:
        for row in table.rows:
            cell_texts = [
                cell.text.strip()
                for cell in row.cells
                if cell.text and cell.text.strip()
            ]
            if cell_texts:
                lines.append(" | ".join(cell_texts))
    return "\n".join(lines)
|
||||
|
||||
# Cap on pages read per PDF; override via the PDF_PAGE_LIMIT env var.
_PDF_PAGE_LIMIT = int(os.environ.get("PDF_PAGE_LIMIT", "40"))

# Both PDF backends emit chatty warnings ("Ignoring wrong pointing
# object ...") on slightly malformed files; keep only errors.
for _pdf_logger_name in ("pypdf", "PyPDF2"):
    logging.getLogger(_pdf_logger_name).setLevel(logging.ERROR)
|
||||
|
||||
|
||||
def extract_text_from_pdf(path: Path) -> str:
    """Extract text from a PDF via the optional pypdf/PyPDF2 backend.

    Prefers the lightweight PyPDF-based readers over heavy pdfminer and
    reads at most PDF_PAGE_LIMIT pages (default 40) so pathological files
    cannot stall extraction.

    Raises:
        RuntimeError: when no backend is installed, or the backend cannot
            open the file.
    """
    if _PdfReader is None:
        raise RuntimeError("PDF reader dependency missing (install pypdf or PyPDF2)")

    try:
        reader = _PdfReader(str(path), strict=False)
    except Exception as exc:  # pragma: no cover - pdf parser edge cases
        raise RuntimeError(f"PDF read failed: {exc}") from exc

    chunks = []
    for page_no, page in enumerate(getattr(reader, "pages", [])):
        # A limit of 0 disables the cap entirely.
        if _PDF_PAGE_LIMIT and page_no >= _PDF_PAGE_LIMIT:
            break
        try:
            page_text = page.extract_text()  # type: ignore[attr-defined]
        except Exception:
            # A single unreadable page should not sink the whole document.
            page_text = None
        if page_text:
            chunks.append(page_text)
    return "\n".join(chunks)
|
||||
|
||||
def extract_text_from_doc_best_effort(path: Path) -> str:
    """Best-effort text extraction for legacy .doc files.

    Legacy .doc requires external tooling; when the optional ``textract``
    package is missing or fails, return "" instead of raising.
    """
    try:
        import textract  # type: ignore
    except Exception:  # optional dependency absent
        return ""
    try:
        raw = textract.process(str(path))
        return raw.decode("utf-8", errors="ignore")
    except Exception:
        return ""
|
||||
|
||||
def extract_text(path: Path) -> str:
    """Route *path* to the extractor matching its (case-insensitive) extension.

    Supported: .txt/.log, .html/.htm, .docx, .pdf, .doc.  Any other
    extension yields an empty string.
    """
    suffix = path.suffix.lower()
    if suffix in {".txt", ".log"}:
        return extract_text_from_txt(path)
    if suffix in {".html", ".htm"}:
        return extract_text_from_html(path)
    if suffix == ".docx":
        return extract_text_from_docx(path)
    if suffix == ".pdf":
        return extract_text_from_pdf(path)
    if suffix == ".doc":
        return extract_text_from_doc_best_effort(path)
    return ""
|
||||
Reference in New Issue
Block a user