Initial commit
This commit is contained in:
99
extract/text_extract.py
Normal file
99
extract/text_extract.py
Normal file
@@ -0,0 +1,99 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
import logging
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Locate a PdfReader implementation from whichever optional backend is
# installed: prefer the maintained `pypdf`, fall back to the legacy
# `PyPDF2` name, and leave `_PdfReader` as None when neither is present
# (extract_text_from_pdf checks for None and raises RuntimeError).
try:  # optional dependency for PDF fallback
    from pypdf import PdfReader as _PdfReader  # type: ignore
except Exception:  # pragma: no cover - optional import
    try:
        from PyPDF2 import PdfReader as _PdfReader  # type: ignore
    except Exception:  # pragma: no cover
        _PdfReader = None  # type: ignore
|
||||
|
||||
def _read_bytes(path: Path) -> bytes:
    """Return the file's content as raw, undecoded bytes."""
    with path.open("rb") as handle:
        return handle.read()
|
||||
|
||||
def extract_text_from_txt(path: Path) -> str:
    """Decode a text file, trying several encodings in order.

    Each candidate encoding is tried strictly; the first one that decodes
    the whole file wins.  If none fits, fall back to UTF-8 with undecodable
    bytes dropped so the caller always receives *some* text.

    Bug fix: the previous version decoded with errors="ignore" inside the
    loop, which never raises, so the first (utf-8) attempt always
    "succeeded" and the utf-16/cp1251/latin-1 fallbacks were dead code.
    """
    data = path.read_bytes()
    for enc in ("utf-8", "utf-16", "cp1251", "latin-1"):
        try:
            # Strict decode: raises on a mismatch so the next encoding
            # actually gets a chance.
            return data.decode(enc)
        except (UnicodeDecodeError, UnicodeError):
            continue
    # Defensive last resort (latin-1 above accepts any byte, so this is
    # normally unreachable); never raise out of a plain-text read.
    return data.decode("utf-8", errors="ignore")
|
||||
|
||||
def extract_text_from_html(path: Path) -> str:
    """Strip markup from an HTML file and return its visible text.

    The file is decoded via extract_text_from_txt, parsed with the lxml
    backend, and flattened to newline-separated, stripped text nodes.
    """
    markup = extract_text_from_txt(path)
    return BeautifulSoup(markup, "lxml").get_text("\n", strip=True)
|
||||
|
||||
def extract_text_from_docx(path: Path) -> str:
    """Collect text from a .docx file: one line per paragraph, and one
    pipe-joined line per non-empty table row."""
    from docx import Document

    document = Document(str(path))
    lines = []
    for paragraph in document.paragraphs:
        if paragraph.text and paragraph.text.strip():
            lines.append(paragraph.text.strip())
    for table in document.tables:
        for row in table.rows:
            cell_texts = [
                cell.text.strip()
                for cell in row.cells
                if cell.text and cell.text.strip()
            ]
            if cell_texts:
                lines.append(" | ".join(cell_texts))
    return "\n".join(lines)
|
||||
|
||||
# Cap on pages read per PDF; override via the PDF_PAGE_LIMIT env var.
_PDF_PAGE_LIMIT = int(os.environ.get("PDF_PAGE_LIMIT", "40"))

# Both PDF backends emit chatty warnings ("Ignoring wrong pointing
# object ...") on slightly malformed files; keep only errors.
for _pdf_logger_name in ("pypdf", "PyPDF2"):
    logging.getLogger(_pdf_logger_name).setLevel(logging.ERROR)
|
||||
|
||||
|
||||
def extract_text_from_pdf(path: Path) -> str:
    """Extract text from a PDF via the optional pypdf/PyPDF2 backend.

    Prefers the lightweight PyPDF-based readers over heavy pdfminer and
    reads at most PDF_PAGE_LIMIT pages (default 40) so pathological files
    cannot stall extraction.

    Raises:
        RuntimeError: when no backend is installed, or the backend cannot
            open the file.
    """
    if _PdfReader is None:
        raise RuntimeError("PDF reader dependency missing (install pypdf or PyPDF2)")

    try:
        reader = _PdfReader(str(path), strict=False)
    except Exception as exc:  # pragma: no cover - pdf parser edge cases
        raise RuntimeError(f"PDF read failed: {exc}") from exc

    chunks = []
    for page_no, page in enumerate(getattr(reader, "pages", [])):
        # A limit of 0 disables the cap entirely.
        if _PDF_PAGE_LIMIT and page_no >= _PDF_PAGE_LIMIT:
            break
        try:
            page_text = page.extract_text()  # type: ignore[attr-defined]
        except Exception:
            # A single unreadable page should not sink the whole document.
            page_text = None
        if page_text:
            chunks.append(page_text)
    return "\n".join(chunks)
|
||||
|
||||
def extract_text_from_doc_best_effort(path: Path) -> str:
    """Best-effort text extraction for legacy .doc files.

    Legacy .doc requires external tooling; when the optional ``textract``
    package is missing or fails, return "" instead of raising.
    """
    try:
        import textract  # type: ignore
    except Exception:  # optional dependency absent
        return ""
    try:
        raw = textract.process(str(path))
        return raw.decode("utf-8", errors="ignore")
    except Exception:
        return ""
|
||||
|
||||
def extract_text(path: Path) -> str:
    """Route *path* to the extractor matching its (case-insensitive) extension.

    Supported: .txt/.log, .html/.htm, .docx, .pdf, .doc.  Any other
    extension yields an empty string.
    """
    suffix = path.suffix.lower()
    if suffix in {".txt", ".log"}:
        return extract_text_from_txt(path)
    if suffix in {".html", ".htm"}:
        return extract_text_from_html(path)
    if suffix == ".docx":
        return extract_text_from_docx(path)
    if suffix == ".pdf":
        return extract_text_from_pdf(path)
    if suffix == ".doc":
        return extract_text_from_doc_best_effort(path)
    return ""
|
||||
Reference in New Issue
Block a user