Files
tg_resume_db/extract/pdf_extract.py
2026-03-11 15:27:10 +03:00

212 lines
5.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from __future__ import annotations
import re
import shutil
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple
# Optional PDF backends. Each import is attempted once at module load; when a
# backend is unavailable its name is bound to None so callers can test for it
# instead of handling ImportError themselves.
#
# PdfReader: taken from ``pypdf`` first, then from ``PyPDF2``; None if neither
# can be imported.
try:  # optional dependency
    from pypdf import PdfReader  # type: ignore
except Exception:  # pragma: no cover
    try:
        from PyPDF2 import PdfReader  # type: ignore
    except Exception:  # pragma: no cover
        PdfReader = None  # type: ignore
# pdfminer_extract_text: ``pdfminer.high_level.extract_text``; None if the
# import fails.
try:  # optional dependency
    from pdfminer.high_level import extract_text as pdfminer_extract_text  # type: ignore
except Exception:  # pragma: no cover
    pdfminer_extract_text = None  # type: ignore
@dataclass
class PdfExtractResult:
    """Result record returned by ``extract_pdf_best``.

    Carries the selected text together with the name of the extractor that
    produced it and the heuristic quality score/flags computed for that text.
    """

    # Selected text; empty string when no extractor produced any output.
    text: str
    # Per-page records as built by ``_extract_pages_pypdf``: dicts with a
    # 1-based "page" number and that page's "text".
    pages: List[dict]
    # Extractor label, e.g. "pdftotext_layout", "pdftotext_plain", "pypdf",
    # "pdfminer", or "none" when nothing was extracted.
    method: str
    # Heuristic quality score from ``_quality_score`` (higher is better).
    score: float
    # Diagnostic flags from ``_quality_score`` (e.g. "empty", "glued_text").
    flags: List[str]
# Lower-case resume section headings (English and Russian). ``_quality_score``
# counts how many of these occur as substrings of the lower-cased text and
# rewards extractions in which headings survived.
# NOTE(review): "experience" is a substring of "work experience", so a single
# "Work Experience" heading registers as two hits — confirm this is intended.
_SECTION_HINTS = [
    "experience", "work experience", "skills", "education", "projects", "summary", "about",
    "опыт работы", "навыки", "образование", "проекты", "о себе",
]
def _which_pdftotext() -> Optional[str]:
    """Locate the ``pdftotext`` executable on ``PATH``.

    Returns:
        The resolved path of ``pdftotext`` (or, failing that,
        ``pdftotext.exe``), or ``None`` when neither can be found.
    """
    for candidate in ("pdftotext", "pdftotext.exe"):
        located = shutil.which(candidate)
        if located:
            return located
    return None
def _run_pdftotext(path: Path, *, layout: bool, timeout_sec: int = 25) -> str:
    """Extract text from a PDF with the external ``pdftotext`` tool.

    Args:
        path: PDF file to convert.
        layout: When true, pass ``-layout`` so the tool tries to keep the
            physical page layout.
        timeout_sec: Maximum number of seconds the subprocess may run.

    Returns:
        The tool's standard output with surrounding whitespace stripped, or
        an empty string when the executable is not installed, the run times
        out, or the process cannot be started. Extraction is best-effort, so
        failures are never raised to the caller.
    """
    exe = _which_pdftotext()
    if not exe:
        return ""

    cmd = [exe]
    if layout:
        cmd.append("-layout")
    # The output below is decoded as UTF-8, so ask for UTF-8 explicitly:
    # Xpdf builds of pdftotext default to Latin1, which would drop Cyrillic
    # and other non-Latin text. "-nopgbrk" suppresses form-feed page
    # separators and the trailing "-" sends the text to stdout.
    cmd += ["-enc", "UTF-8", "-nopgbrk", str(path), "-"]

    try:
        proc = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout_sec,
            check=False,  # a non-zero exit simply yields whatever was printed
            text=True,
            encoding="utf-8",
            errors="ignore",
        )
    except Exception:
        # Deliberately broad: timeouts, a missing/denied executable, and any
        # other launch problem all mean "this extractor produced nothing".
        return ""
    return (proc.stdout or "").strip()
def _extract_pages_pypdf(path: Path, max_pages: int = 60) -> List[dict]:
    """Read per-page text from a PDF using the optional pypdf/PyPDF2 reader.

    Args:
        path: PDF file to read.
        max_pages: Stop after this many pages; a falsy value disables the cap.

    Returns:
        One ``{"page": <1-based number>, "text": <str>}`` dict per page read.
        A page whose extraction raises contributes an empty ``text``. The
        list is empty when no reader is available or the file cannot be
        opened.
    """
    if PdfReader is None:
        return []

    try:
        reader = PdfReader(str(path), strict=False)
    except Exception:
        return []

    collected: List[dict] = []
    for page_no, page in enumerate(getattr(reader, "pages", []), start=1):
        if max_pages and page_no > max_pages:
            break
        try:
            page_text = page.extract_text() or ""
        except Exception:
            # One unreadable page must not discard the pages already read.
            page_text = ""
        collected.append({"page": page_no, "text": page_text})
    return collected
def _extract_pdfminer(path: Path) -> str:
    """Extract the whole document's text with the optional pdfminer backend.

    Returns:
        The extracted text with surrounding whitespace stripped, or an empty
        string when pdfminer is not importable or extraction raises.
    """
    if pdfminer_extract_text is None:
        return ""
    try:
        raw = pdfminer_extract_text(str(path))
        return raw.strip() if raw else ""
    except Exception:
        return ""
def _quality_score(text: str) -> Tuple[float, List[str]]:
    """Heuristically rate how clean an extracted text looks.

    The score is a sum of independent checks (maximum 7.0):

    * share of alphabetic characters: +2.0 at >= 0.45, +1.0 at >= 0.30
    * share of space characters within 0.10..0.28: +1.0
    * average word length within 3.5..9.0: +1.0
    * at most 6% of non-blank lines longer than 200 characters: +1.0
    * at most 6 "glued" boundaries (lower->upper case, letter<->digit): +1.0
    * known section headings found: +1.0 for two or more, +0.5 for one

    Args:
        text: Extracted text to evaluate.

    Returns:
        ``(score, flags)`` where ``flags`` names every failed check plus
        ``"short_text"`` (< 200 chars) and ``"scan_like"`` (almost no letters
        or < 120 chars). Empty input yields ``(0.0, ["empty"])``.
    """
    flags: List[str] = []
    if not text:
        return 0.0, ["empty"]

    total = len(text)  # non-zero here, so the ratios below are safe
    alpha_ratio = sum(ch.isalpha() for ch in text) / total
    space_ratio = text.count(" ") / total

    # "Ё"/"ё" lie outside the contiguous А-я code-point range and must be
    # listed explicitly, otherwise Russian words containing them are split.
    words = re.findall(r"[A-Za-zА-Яа-яЁё0-9]+", text)
    avg_word_len = (sum(len(w) for w in words) / len(words)) if words else 0.0

    lines = [ln for ln in text.splitlines() if ln.strip()]
    long_line_ratio = (sum(len(ln) > 200 for ln in lines) / len(lines)) if lines else 0.0

    # Boundaries that usually indicate lost whitespace between tokens.
    glued_hits = len(
        re.findall(
            r"[a-zа-яё][A-ZА-ЯЁ]|[A-Za-zА-Яа-яЁё][0-9]|[0-9][A-Za-zА-Яа-яЁё]",
            text,
        )
    )

    # Lower-case once instead of once per hint.
    lowered = text.lower()
    section_hits = sum(1 for hint in _SECTION_HINTS if hint in lowered)

    score = 0.0

    if alpha_ratio >= 0.45:
        score += 2.0
    elif alpha_ratio >= 0.30:
        score += 1.0
    else:
        flags.append("low_alpha")

    if 0.10 <= space_ratio <= 0.28:
        score += 1.0
    else:
        flags.append("odd_spacing")

    if 3.5 <= avg_word_len <= 9.0:
        score += 1.0
    else:
        flags.append("odd_word_len")

    if long_line_ratio <= 0.06:
        score += 1.0
    else:
        flags.append("long_lines")

    if glued_hits <= 6:
        score += 1.0
    else:
        flags.append("glued_text")

    if section_hits >= 2:
        score += 1.0
    elif section_hits == 1:
        score += 0.5

    # Informational flags; they do not change the score.
    if total < 200:
        flags.append("short_text")
    if alpha_ratio < 0.08 or total < 120:
        flags.append("scan_like")

    return score, flags
def deglue_text(text: str) -> str:
    """Insert spaces at boundaries where PDF extraction glued tokens together.

    A single space is inserted between:

    * a lower-case letter followed by an upper-case letter (``fooBar``)
    * a letter followed by a digit, and a digit followed by a letter
    * ``:`` or ``;`` followed directly by a letter

    Latin and Russian letters are recognised, including "Ё"/"ё", which lie
    outside the contiguous ``А-я`` range and therefore have to be listed
    explicitly in every character class.

    Args:
        text: Text to repair; falsy input is returned unchanged.

    Returns:
        The text with the separating spaces inserted.

    NOTE(review): the letter/digit rules also split identifiers such as
    ``user123@mail.com`` or ``B2B``; callers that need those intact should
    extract them before calling this function.
    """
    if not text:
        return text

    fixed = text
    fixed = re.sub(r"([a-zа-яё])([A-ZА-ЯЁ])", r"\1 \2", fixed)
    fixed = re.sub(r"([A-Za-zА-Яа-яЁё])([0-9])", r"\1 \2", fixed)
    fixed = re.sub(r"([0-9])([A-Za-zА-Яа-яЁё])", r"\1 \2", fixed)
    fixed = re.sub(r"([:;])([A-Za-zА-Яа-яЁё])", r"\1 \2", fixed)
    return fixed
def extract_pdf_best(path: Path, timeout_sec: int = 25) -> PdfExtractResult:
    """Extract text from a PDF with every available backend and keep the best.

    Backends are tried in this order: ``pdftotext -layout``, plain
    ``pdftotext``, pypdf/PyPDF2, pdfminer. Each non-empty output is rated
    with ``_quality_score``; the highest score wins and ties go to the
    backend tried first. The winning text is passed through ``deglue_text``
    before being returned.

    Args:
        path: PDF file to process.
        timeout_sec: Timeout for each ``pdftotext`` invocation. The in-process
            backends are not bounded by it.

    Returns:
        A ``PdfExtractResult``. ``pages`` always comes from the pypdf reader,
        whichever backend supplied ``text``, and is not de-glued. When no
        backend yields text the result is empty with ``method="none"`` and
        ``flags=["empty"]``.
    """
    candidates: List[Tuple[str, str]] = []

    for label, use_layout in (("pdftotext_layout", True), ("pdftotext_plain", False)):
        extracted = _run_pdftotext(path, layout=use_layout, timeout_sec=timeout_sec)
        if extracted:
            candidates.append((label, extracted))

    # Parse with pypdf exactly once: the page list serves both as a text
    # candidate and as the ``pages`` payload of the result. The helper
    # returns [] by itself when no reader is installed.
    pages = _extract_pages_pypdf(path)
    txt_pypdf = "\n\n".join(p.get("text", "") for p in pages if p.get("text"))
    if txt_pypdf:
        candidates.append(("pypdf", txt_pypdf))

    txt_pdfminer = _extract_pdfminer(path)
    if txt_pdfminer:
        candidates.append(("pdfminer", txt_pdfminer))

    if not candidates:
        return PdfExtractResult(text="", pages=[], method="none", score=0.0, flags=["empty"])

    best_method = "none"
    best_text = ""
    best_score = -1.0
    best_flags: List[str] = []
    for method, text in candidates:
        score, flags = _quality_score(text)
        # Strict ">" keeps the earlier backend on equal scores.
        if score > best_score:
            best_method, best_text, best_score, best_flags = method, text, score, flags

    return PdfExtractResult(
        text=deglue_text(best_text),
        pages=pages,
        method=best_method,
        score=best_score,
        flags=best_flags,
    )