# 100 lines · 3.3 KiB · Python (file-listing metadata, not code)
from __future__ import annotations
|
|
|
|
import os
|
|
from pathlib import Path
|
|
import logging
|
|
from bs4 import BeautifulSoup
|
|
|
|
try: # optional dependency for PDF fallback
|
|
from pypdf import PdfReader as _PdfReader # type: ignore
|
|
except Exception: # pragma: no cover - optional import
|
|
try:
|
|
from PyPDF2 import PdfReader as _PdfReader # type: ignore
|
|
except Exception: # pragma: no cover
|
|
_PdfReader = None # type: ignore
|
|
|
|
def _read_bytes(path: Path) -> bytes:
|
|
return path.read_bytes()
|
|
|
|
def extract_text_from_txt(path: Path) -> str:
    """
    Decode a text file, trying a list of encodings in order.

    Each candidate is tried *strictly* so that a mismatch actually falls
    through to the next encoding.  (The previous version decoded with
    errors="ignore" inside the loop, which can never raise, so the first
    attempt always "succeeded" and the fallback encodings were dead code.)
    UTF-16 is only attempted when a BOM is present, because without one
    UTF-16 happily decodes almost any even-length byte string into garbage.

    Returns the decoded text; as a last resort decodes as UTF-8 while
    dropping undecodable bytes, so this never raises for readable files.
    """
    data = path.read_bytes()
    # b"\xff\xfe" = UTF-16 LE BOM, b"\xfe\xff" = UTF-16 BE BOM.
    if data[:2] in (b"\xff\xfe", b"\xfe\xff"):
        return data.decode("utf-16", errors="ignore")
    for enc in ("utf-8", "cp1251", "latin-1"):
        try:
            return data.decode(enc)
        except UnicodeDecodeError:
            continue
    # latin-1 above accepts every byte value, so this is a pure safety net.
    return data.decode("utf-8", errors="ignore")
|
|
|
|
def extract_text_from_html(path: Path) -> str:
    """Parse an HTML file and return its visible text, nodes joined by newlines."""
    markup = extract_text_from_txt(path)
    return BeautifulSoup(markup, "lxml").get_text("\n", strip=True)
|
|
|
|
def extract_text_from_docx(path: Path) -> str:
    """
    Extract text from a .docx file: non-empty paragraphs first, then table
    rows rendered as " | "-joined cell texts, all joined by newlines.
    """
    from docx import Document

    document = Document(str(path))
    chunks = [
        para.text.strip()
        for para in document.paragraphs
        if para.text and para.text.strip()
    ]
    for tbl in document.tables:
        for row in tbl.rows:
            cell_texts = [
                cell.text.strip()
                for cell in row.cells
                if cell.text and cell.text.strip()
            ]
            if cell_texts:
                chunks.append(" | ".join(cell_texts))
    return "\n".join(chunks)
|
|
|
|
# Maximum number of PDF pages to read, overridable via the PDF_PAGE_LIMIT
# environment variable.  A value of 0 disables the cap (the extractor checks
# truthiness before comparing).  NOTE: a non-integer value raises at import.
_PDF_PAGE_LIMIT = int(os.environ.get("PDF_PAGE_LIMIT", "40"))

# Silence noisy pypdf warnings like "Ignoring wrong pointing object ..."
# (both logger names are configured since either backend may be the one loaded).
logging.getLogger("pypdf").setLevel(logging.ERROR)
logging.getLogger("PyPDF2").setLevel(logging.ERROR)
|
|
|
|
|
|
def extract_text_from_pdf(path: Path) -> str:
    """
    Extract text from a PDF via the optional pypdf/PyPDF2 backend.

    Scans at most PDF_PAGE_LIMIT pages (default 40; 0 means no cap) so that
    pathological documents cannot stall processing.  Pages whose extraction
    fails are skipped silently.

    Raises RuntimeError when no PDF backend is installed or when the file
    cannot be parsed at all.
    """
    if _PdfReader is None:
        raise RuntimeError("PDF reader dependency missing (install pypdf or PyPDF2)")

    try:
        reader = _PdfReader(str(path), strict=False)
    except Exception as exc:  # pragma: no cover - pdf parser edge cases
        raise RuntimeError(f"PDF read failed: {exc}") from exc

    collected = []
    for page_no, page in enumerate(getattr(reader, "pages", [])):
        if _PDF_PAGE_LIMIT and page_no >= _PDF_PAGE_LIMIT:
            break
        try:
            page_text = page.extract_text()  # type: ignore[attr-defined]
        except Exception:
            page_text = None
        if page_text:
            collected.append(page_text)
    return "\n".join(collected)
|
|
|
|
def extract_text_from_doc_best_effort(path: Path) -> str:
    """
    Best-effort text extraction for legacy .doc files.

    Legacy .doc needs external tooling, so this only works when the optional
    ``textract`` package is installed; any failure (missing package, broken
    file, tool error) yields an empty string rather than an exception.
    """
    try:
        import textract  # type: ignore

        raw = textract.process(str(path))
    except Exception:
        return ""
    return raw.decode("utf-8", errors="ignore")
|
|
|
|
def extract_text(path: Path) -> str:
    """
    Route *path* to the extractor matching its (case-insensitive) extension.

    Supported: .txt/.log, .html/.htm, .docx, .pdf, and .doc (best effort).
    Any other extension yields an empty string.
    """
    suffix = path.suffix.lower()
    if suffix in {".txt", ".log"}:
        return extract_text_from_txt(path)
    if suffix in {".html", ".htm"}:
        return extract_text_from_html(path)
    if suffix == ".docx":
        return extract_text_from_docx(path)
    if suffix == ".pdf":
        return extract_text_from_pdf(path)
    if suffix == ".doc":
        return extract_text_from_doc_best_effort(path)
    return ""
|