# tg_resume_db/extract/pdf_extract.py

from __future__ import annotations

import re
import shutil
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple

try:  # optional dependency
    from pypdf import PdfReader  # type: ignore
except Exception:  # pragma: no cover
    try:
        from PyPDF2 import PdfReader  # type: ignore
    except Exception:  # pragma: no cover
        PdfReader = None  # type: ignore

try:  # optional dependency
    from pdfminer.high_level import extract_text as pdfminer_extract_text  # type: ignore
except Exception:  # pragma: no cover
    pdfminer_extract_text = None  # type: ignore


@dataclass
class PdfExtractResult:
    """Result of extracting text from a PDF with the best available backend."""
    text: str  # extracted document text ("" when nothing could be extracted)
    pages: List[dict]  # per-page records as produced by _extract_pages_pypdf
    method: str  # name of the backend whose text was chosen, or "none"
    score: float  # heuristic quality score of the chosen text
    flags: List[str]  # diagnostic flags reported for the chosen text


# Lower-case section-heading keywords (English and Russian). _quality_score
# counts how many of them occur as substrings of the lower-cased text and
# raises the score when at least one is present.
_SECTION_HINTS = [
    "experience", "work experience", "skills", "education", "projects", "summary", "about",
    "опыт работы", "навыки", "образование", "проекты", "о себе",
]


def _which_pdftotext() -> Optional[str]:
    """Locate the ``pdftotext`` executable on PATH.

    The plain name is tried first, then the ``.exe`` variant.

    Returns:
        The resolved executable path, or ``None`` when neither name is found.
    """
    for candidate in ("pdftotext", "pdftotext.exe"):
        resolved = shutil.which(candidate)
        if resolved:
            return resolved
    return None


def _run_pdftotext(path: Path, *, layout: bool, timeout_sec: int = 25) -> str:
    """Run ``pdftotext`` on *path* and return its stripped standard output.

    Args:
        path: PDF file to convert.
        layout: When true, pass ``-layout`` to ``pdftotext``.
        timeout_sec: Maximum number of seconds the subprocess may run.

    Returns:
        The decoded, stripped stdout of the tool, or ``""`` when the tool is
        not installed or the invocation fails for any reason (including a
        timeout). This helper is best-effort and never raises.
    """
    binary = _which_pdftotext()
    if not binary:
        return ""

    # "-nopgbrk" suppresses page-break characters; the trailing "-" makes
    # pdftotext write to stdout instead of a file.
    argv = [binary, *(["-layout"] if layout else []), "-nopgbrk", str(path), "-"]

    try:
        completed = subprocess.run(
            argv,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout_sec,
            check=False,
            text=True,
            encoding="utf-8",
            errors="ignore",
        )
    except Exception:
        return ""
    return (completed.stdout or "").strip()


def _extract_pages_pypdf(path: Path, max_pages: int = 60) -> List[dict]:
    """Extract per-page text from *path* using pypdf / PyPDF2.

    Args:
        path: PDF file to read.
        max_pages: Upper bound on the number of pages to read; a falsy value
            (``0`` / ``None``) disables the cap.

    Returns:
        A list of ``{"page": <1-based index>, "text": <str>}`` dicts, one per
        page read. A page whose text extraction fails contributes ``""``.
        Returns ``[]`` when no PDF reader library is available or the file
        cannot be opened. This helper is best-effort and never raises.
    """
    if PdfReader is None:
        return []
    try:
        reader = PdfReader(str(path), strict=False)
    except Exception:
        return []

    pages: List[dict] = []
    try:
        for i, page in enumerate(getattr(reader, "pages", [])):
            if max_pages and i >= max_pages:
                break
            try:
                text = page.extract_text() or ""
            except Exception:
                text = ""
            pages.append({"page": i + 1, "text": text})
    except Exception:
        # The reader resolves the page tree lazily, so merely iterating
        # ``reader.pages`` can raise (damaged page tree, document that is
        # still encrypted). Stay best-effort like the rest of this module and
        # keep whatever pages were collected before the failure.
        return pages
    return pages


def _extract_pdfminer(path: Path) -> str:
    """Extract the whole document text of *path* via pdfminer.

    Returns:
        The stripped text, or ``""`` when pdfminer is not installed, yields
        nothing, or fails for any reason.
    """
    if pdfminer_extract_text is None:
        return ""
    try:
        raw = pdfminer_extract_text(str(path))
        return raw.strip() if raw else ""
    except Exception:
        return ""


# The Cyrillic ranges "А-Я" / "а-я" do not cover "Ё" / "ё" (U+0401 / U+0451),
# so both letters are listed explicitly; otherwise Russian words containing
# "ё" would be split and glued transitions next to it would be missed.
_WORD_RE = re.compile(r"[A-Za-zА-Яа-яЁё0-9]+")
_GLUED_RE = re.compile(
    r"[a-zа-яё][A-ZА-ЯЁ]|[A-Za-zА-Яа-яЁё][0-9]|[0-9][A-Za-zА-Яа-яЁё]"
)


def _quality_score(text: str) -> Tuple[float, List[str]]:
    """Heuristically rate how clean an extracted text looks.

    Points are awarded as follows (maximum 7.0):

    * alphabetic-character ratio: 2.0 if >= 0.45, 1.0 if >= 0.30,
      otherwise flag ``low_alpha``;
    * space-character ratio within [0.10, 0.28]: 1.0, otherwise flag
      ``odd_spacing``;
    * average word length within [3.5, 9.0]: 1.0, otherwise flag
      ``odd_word_len``;
    * at most 6% of non-blank lines longer than 200 characters: 1.0,
      otherwise flag ``long_lines``;
    * at most 6 "glued" transitions (lower->upper case, letter<->digit):
      1.0, otherwise flag ``glued_text``;
    * section-heading hints found in the text: 1.0 for two or more,
      0.5 for exactly one.

    Independently of the score, ``short_text`` is flagged below 200
    characters and ``scan_like`` when the alphabetic ratio is under 0.08 or
    the text is shorter than 120 characters.

    Args:
        text: Extracted text to evaluate.

    Returns:
        ``(score, flags)``. Empty input yields ``(0.0, ["empty"])``.
    """
    flags: List[str] = []
    if not text:
        return 0.0, ["empty"]

    total = len(text)  # guaranteed > 0 past the guard above
    alpha_ratio = sum(ch.isalpha() for ch in text) / total
    space_ratio = text.count(" ") / total

    words = _WORD_RE.findall(text)
    avg_word_len = sum(map(len, words)) / len(words) if words else 0.0

    lines = [ln for ln in text.splitlines() if ln.strip()]
    long_line_ratio = (
        sum(1 for ln in lines if len(ln) > 200) / len(lines) if lines else 0.0
    )

    glued_hits = len(_GLUED_RE.findall(text))

    # Lower-case once instead of once per hint.
    lowered = text.lower()
    section_hits = sum(1 for hint in _SECTION_HINTS if hint in lowered)

    score = 0.0
    if alpha_ratio >= 0.45:
        score += 2.0
    elif alpha_ratio >= 0.30:
        score += 1.0
    else:
        flags.append("low_alpha")

    if 0.10 <= space_ratio <= 0.28:
        score += 1.0
    else:
        flags.append("odd_spacing")

    if 3.5 <= avg_word_len <= 9.0:
        score += 1.0
    else:
        flags.append("odd_word_len")

    if long_line_ratio <= 0.06:
        score += 1.0
    else:
        flags.append("long_lines")

    if glued_hits <= 6:
        score += 1.0
    else:
        flags.append("glued_text")

    if section_hits >= 2:
        score += 1.0
    elif section_hits == 1:
        score += 0.5

    if total < 200:
        flags.append("short_text")

    if alpha_ratio < 0.08 or total < 120:
        flags.append("scan_like")

    return score, flags


# (pattern, replacement) pairs applied in order by deglue_text. "Ё" / "ё"
# are listed explicitly because the ranges "А-Я" / "а-я" do not include them.
_DEGLUE_RULES = (
    # lower-case letter immediately followed by an upper-case letter
    (re.compile(r"([a-zа-яё])([A-ZА-ЯЁ])"), r"\1 \2"),
    # letter immediately followed by a digit
    (re.compile(r"([A-Za-zА-Яа-яЁё])([0-9])"), r"\1 \2"),
    # digit immediately followed by a letter
    (re.compile(r"([0-9])([A-Za-zА-Яа-яЁё])"), r"\1 \2"),
    # colon / semicolon immediately followed by a letter
    (re.compile(r"([:;])([A-Za-zА-Яа-яЁё])"), r"\1 \2"),
)


def deglue_text(text: str) -> str:
    """Insert spaces at boundaries where extraction glued tokens together.

    A single space is inserted between a lower-case and a following
    upper-case letter, between a letter and an adjacent digit (either order),
    and after ``:`` or ``;`` when a letter follows directly. Latin and
    Russian Cyrillic letters (including ``Ё``/``ё``) are recognised.

    Args:
        text: Text to repair.

    Returns:
        The repaired text; falsy input is returned unchanged.
    """
    if not text:
        return text
    for pattern, replacement in _DEGLUE_RULES:
        text = pattern.sub(replacement, text)
    return text


def extract_pdf_best(path: Path, timeout_sec: int = 25) -> PdfExtractResult:
    """Extract text from *path* with every available backend and keep the best.

    Candidates are produced, in this order, by ``pdftotext -layout``, plain
    ``pdftotext``, pypdf and pdfminer; backends that yield no text are
    skipped. Each candidate is rated with ``_quality_score`` and the highest
    score wins (the earliest candidate wins a tie). The winning text is
    passed through ``deglue_text`` before being returned.

    Args:
        path: PDF file to process.
        timeout_sec: Timeout, in seconds, for each ``pdftotext`` invocation.

    Returns:
        A ``PdfExtractResult``. ``pages`` always holds the pypdf per-page
        records, whichever backend supplied ``text``. When no backend yields
        any text the result is empty with ``method="none"`` and
        ``flags=["empty"]``.
    """
    # Parse with pypdf once and reuse the pages both as a text candidate and
    # as the per-page payload of the result.
    pages = _extract_pages_pypdf(path)

    raw_candidates: List[Tuple[str, str]] = [
        ("pdftotext_layout", _run_pdftotext(path, layout=True, timeout_sec=timeout_sec)),
        ("pdftotext_plain", _run_pdftotext(path, layout=False, timeout_sec=timeout_sec)),
        ("pypdf", "\n\n".join(p["text"] for p in pages if p.get("text"))),
        ("pdfminer", _extract_pdfminer(path)),
    ]
    candidates = [(method, text) for method, text in raw_candidates if text]

    if not candidates:
        return PdfExtractResult(text="", pages=[], method="none", score=0.0, flags=["empty"])

    scored = [(method, text, *_quality_score(text)) for method, text in candidates]
    # max() returns the first maximal item, so earlier backends win ties.
    best_method, best_text, best_score, best_flags = max(scored, key=lambda item: item[2])

    return PdfExtractResult(
        text=deglue_text(best_text),
        pages=pages,
        method=best_method,
        score=best_score,
        flags=best_flags,
    )