# Best-effort PDF text extraction.
#
# Several independent extractors are tried (the ``pdftotext`` CLI in layout
# and plain mode, pypdf/PyPDF2, pdfminer).  Each non-empty output is rated by
# a small heuristic and the highest-rated one is returned.  Every extractor
# is optional: a missing tool or library simply contributes no candidate.
from __future__ import annotations

import re
import shutil
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple

try:  # optional dependency
    from pypdf import PdfReader  # type: ignore
except Exception:  # pragma: no cover
    try:
        from PyPDF2 import PdfReader  # type: ignore
    except Exception:  # pragma: no cover
        PdfReader = None  # type: ignore

try:  # optional dependency
    from pdfminer.high_level import extract_text as pdfminer_extract_text  # type: ignore
except Exception:  # pragma: no cover
    pdfminer_extract_text = None  # type: ignore


@dataclass
class PdfExtractResult:
    """Outcome of :func:`extract_pdf_best`."""

    # Text of the winning candidate, after ``deglue_text``.
    text: str
    # Per-page dicts ``{"page": <1-based index>, "text": <str>}`` from pypdf;
    # empty when pypdf is unavailable or could not read the file.
    pages: List[dict]
    # Name of the winning extractor, or "none" when nothing produced text.
    method: str
    # Heuristic quality score of the winning candidate (see ``_quality_score``).
    score: float
    # Quality warnings raised for the winning candidate.
    flags: List[str]


# Lower-case headings whose presence suggests the text kept its structure.
# NOTE: "experience" is a substring of "work experience", so that single
# heading is counted twice by ``_quality_score``.
_SECTION_HINTS = [
    "experience",
    "work experience",
    "skills",
    "education",
    "projects",
    "summary",
    "about",
    "опыт работы",
    "навыки",
    "образование",
    "проекты",
    "о себе",
]

# Latin + Russian letters.  "Ё"/"ё" lie outside the contiguous "А-я" code
# point range and therefore have to be listed explicitly.
_LETTER = "A-Za-zА-Яа-яЁё"
_WORD_RE = re.compile(rf"[{_LETTER}0-9]+")
_CASE_BOUNDARY_RE = re.compile(r"([a-zа-яё])([A-ZА-ЯЁ])")
_LETTER_DIGIT_RE = re.compile(rf"([{_LETTER}])([0-9])")
_DIGIT_LETTER_RE = re.compile(rf"([0-9])([{_LETTER}])")
_PUNCT_LETTER_RE = re.compile(rf"([:;])([{_LETTER}])")
_GLUED_RE = re.compile(
    rf"[a-zа-яё][A-ZА-ЯЁ]|[{_LETTER}][0-9]|[0-9][{_LETTER}]"
)


def _which_pdftotext() -> Optional[str]:
    """Return the path of the ``pdftotext`` executable, or ``None`` if absent."""
    return shutil.which("pdftotext") or shutil.which("pdftotext.exe")


def _run_pdftotext(path: Path, *, layout: bool, timeout_sec: int = 25) -> str:
    """Extract text from *path* with the ``pdftotext`` command-line tool.

    Args:
        path: PDF file to read.
        layout: Pass ``-layout`` so the tool keeps the physical page layout.
        timeout_sec: Maximum run time of the subprocess in seconds.

    Returns:
        The stripped standard output, or ``""`` when the tool is missing,
        cannot be started or times out.
    """
    exe = _which_pdftotext()
    if not exe:
        return ""

    # Ask for UTF-8 explicitly: stdout is decoded as UTF-8 below, and not
    # every pdftotext build uses UTF-8 as its default output encoding.
    cmd = [exe, "-enc", "UTF-8"]
    if layout:
        cmd.append("-layout")
    cmd += ["-nopgbrk", str(path), "-"]

    try:
        proc = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout_sec,
            check=False,
            text=True,
            encoding="utf-8",
            errors="ignore",
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        # Best effort: a failing external tool just yields no candidate.
        return ""
    return (proc.stdout or "").strip()


def _extract_pages_pypdf(path: Path, max_pages: int = 60) -> List[dict]:
    """Extract per-page text with pypdf / PyPDF2.

    Args:
        path: PDF file to read.
        max_pages: Stop after this many pages; a falsy value disables the cap.

    Returns:
        A list of ``{"page": <1-based index>, "text": <str>}`` dicts.  A page
        whose extraction fails gets an empty ``text``.  The list is empty
        when the library is unavailable or the file cannot be opened.
    """
    if PdfReader is None:
        return []
    try:
        reader = PdfReader(str(path), strict=False)
    except Exception:
        return []

    pages: List[dict] = []
    try:
        for index, page in enumerate(getattr(reader, "pages", [])):
            if max_pages and index >= max_pages:
                break
            try:
                text = page.extract_text() or ""
            except Exception:
                text = ""
            pages.append({"page": index + 1, "text": text})
    except Exception:
        # Walking the page list can itself fail (for instance on a document
        # that is still encrypted).  Keep whatever was collected instead of
        # aborting the whole extraction.
        return pages
    return pages


def _extract_pdfminer(path: Path) -> str:
    """Return the text pdfminer extracts from *path*, or ``""`` on any failure."""
    if pdfminer_extract_text is None:
        return ""
    try:
        return (pdfminer_extract_text(str(path)) or "").strip()
    except Exception:
        return ""


def _quality_score(text: str) -> Tuple[float, List[str]]:
    """Rate how usable an extracted text looks.

    The score is a sum of independent checks (letter ratio, space ratio,
    average word length, share of very long lines, number of "glued" tokens,
    known section headings); higher is better.

    Args:
        text: Candidate text.

    Returns:
        ``(score, flags)``.  ``flags`` names each failed check plus
        ``"short_text"`` / ``"scan_like"`` for very small or nearly
        letter-free texts.  Empty input yields ``(0.0, ["empty"])``.
    """
    if not text:
        return 0.0, ["empty"]

    flags: List[str] = []
    total = len(text)
    alpha_ratio = sum(ch.isalpha() for ch in text) / total
    space_ratio = text.count(" ") / total

    words = _WORD_RE.findall(text)
    avg_word_len = sum(len(w) for w in words) / len(words) if words else 0.0

    lines = [ln for ln in text.splitlines() if ln.strip()]
    long_line_count = sum(1 for ln in lines if len(ln) > 200)
    long_line_ratio = long_line_count / len(lines) if lines else 0.0

    glued_hits = len(_GLUED_RE.findall(text))

    lowered = text.lower()  # computed once, not once per hint
    section_hits = sum(1 for hint in _SECTION_HINTS if hint in lowered)

    score = 0.0

    if alpha_ratio >= 0.45:
        score += 2.0
    elif alpha_ratio >= 0.30:
        score += 1.0
    else:
        flags.append("low_alpha")

    if 0.10 <= space_ratio <= 0.28:
        score += 1.0
    else:
        flags.append("odd_spacing")

    if 3.5 <= avg_word_len <= 9.0:
        score += 1.0
    else:
        flags.append("odd_word_len")

    if long_line_ratio <= 0.06:
        score += 1.0
    else:
        flags.append("long_lines")

    if glued_hits <= 6:
        score += 1.0
    else:
        flags.append("glued_text")

    if section_hits >= 2:
        score += 1.0
    elif section_hits == 1:
        score += 0.5

    if total < 200:
        flags.append("short_text")
    if alpha_ratio < 0.08 or total < 120:
        flags.append("scan_like")

    return score, flags


def deglue_text(text: str) -> str:
    """Insert the spaces that PDF extraction tends to drop.

    A space is added at lower→upper case boundaries, between a letter and a
    digit (in either order), and after ``:`` / ``;`` when a letter follows
    directly.  Latin and Russian letters (including ``ё``/``Ё``) are handled.

    Args:
        text: Text to repair; falsy input is returned unchanged.

    Returns:
        The text with the separating spaces inserted.
    """
    if not text:
        return text
    result = _CASE_BOUNDARY_RE.sub(r"\1 \2", text)
    result = _LETTER_DIGIT_RE.sub(r"\1 \2", result)
    result = _DIGIT_LETTER_RE.sub(r"\1 \2", result)
    result = _PUNCT_LETTER_RE.sub(r"\1 \2", result)
    return result


def extract_pdf_best(path: Path, timeout_sec: int = 25) -> PdfExtractResult:
    """Extract text from a PDF with every available backend and keep the best.

    Candidates are produced, in this order, by ``pdftotext -layout``, plain
    ``pdftotext``, pypdf and pdfminer.  Each non-empty candidate is rated by
    ``_quality_score``; the highest score wins and the earliest candidate
    wins a tie.  The winning text is passed through ``deglue_text``; the
    reported score and flags describe the text *before* that step.

    Args:
        path: PDF file to read.
        timeout_sec: Time limit for each ``pdftotext`` run, in seconds.

    Returns:
        A ``PdfExtractResult``.  When no backend yields text the result has
        ``method="none"``, empty ``text``/``pages``, ``score=0.0`` and
        ``flags=["empty"]``.
    """
    candidates: List[Tuple[str, str]] = []

    txt_layout = _run_pdftotext(path, layout=True, timeout_sec=timeout_sec)
    if txt_layout:
        candidates.append(("pdftotext_layout", txt_layout))

    txt_plain = _run_pdftotext(path, layout=False, timeout_sec=timeout_sec)
    if txt_plain:
        candidates.append(("pdftotext_plain", txt_plain))

    # Parse with pypdf exactly once; the page list feeds both the candidate
    # text and the ``pages`` field of the result.
    pages = _extract_pages_pypdf(path)
    txt_pypdf = "\n\n".join(p["text"] for p in pages if p.get("text"))
    if txt_pypdf:
        candidates.append(("pypdf", txt_pypdf))

    txt_pdfminer = _extract_pdfminer(path)
    if txt_pdfminer:
        candidates.append(("pdfminer", txt_pdfminer))

    if not candidates:
        return PdfExtractResult(
            text="", pages=[], method="none", score=0.0, flags=["empty"]
        )

    best_method = "none"
    best_text = ""
    best_score = -1.0
    best_flags: List[str] = []
    for method, text in candidates:
        score, flags = _quality_score(text)
        if score > best_score:  # strict ">" keeps the earliest on a tie
            best_method, best_text = method, text
            best_score, best_flags = score, flags

    return PdfExtractResult(
        text=deglue_text(best_text),
        pages=pages,
        method=best_method,
        score=best_score,
        flags=best_flags,
    )