212 lines
5.8 KiB
Python
212 lines
5.8 KiB
Python
from __future__ import annotations
|
||
|
||
import re
|
||
import shutil
|
||
import subprocess
|
||
from dataclasses import dataclass
|
||
from pathlib import Path
|
||
from typing import List, Optional, Tuple
|
||
|
||
try: # optional dependency
|
||
from pypdf import PdfReader # type: ignore
|
||
except Exception: # pragma: no cover
|
||
try:
|
||
from PyPDF2 import PdfReader # type: ignore
|
||
except Exception: # pragma: no cover
|
||
PdfReader = None # type: ignore
|
||
|
||
try: # optional dependency
|
||
from pdfminer.high_level import extract_text as pdfminer_extract_text # type: ignore
|
||
except Exception: # pragma: no cover
|
||
pdfminer_extract_text = None # type: ignore
|
||
|
||
|
||
@dataclass
class PdfExtractResult:
    """Outcome of running the available PDF text extractors on one file.

    Produced by ``extract_pdf_best``, which scores every non-empty candidate
    text and keeps the highest-scoring one.
    """

    # Text of the winning candidate (after ``deglue_text`` post-processing);
    # empty string when no extractor produced anything.
    text: str
    # Per-page records of the form {"page": <1-based number>, "text": <str>}
    # as produced by the pypdf-based page extractor; may be empty.
    pages: List[dict]
    # Name of the extractor that produced ``text``: "pdftotext_layout",
    # "pdftotext_plain", "pypdf", "pdfminer", or "none".
    method: str
    # Heuristic quality score of the winning candidate (higher is better).
    score: float
    # Quality warnings attached to the winning candidate, e.g. "empty",
    # "low_alpha", "odd_spacing", "glued_text", "scan_like".
    flags: List[str]
|
||
|
||
|
||
# Lower-case section headings typically found in a resume/CV (English and
# Russian). The quality scorer counts how many of them occur in an extracted
# text as a sign that the extraction is readable.
_SECTION_HINTS = [
    # English
    "experience",
    "work experience",
    "skills",
    "education",
    "projects",
    "summary",
    "about",
    # Russian
    "опыт работы",
    "навыки",
    "образование",
    "проекты",
    "о себе",
]
|
||
|
||
|
||
def _which_pdftotext() -> Optional[str]:
    """Locate the ``pdftotext`` executable on ``PATH``.

    Returns:
        The path reported by ``shutil.which`` for ``pdftotext`` (or, failing
        that, ``pdftotext.exe``), or ``None`` when neither is found.
    """
    for candidate in ("pdftotext", "pdftotext.exe"):
        located = shutil.which(candidate)
        if located:
            return located
    return None
|
||
|
||
|
||
def _run_pdftotext(path: Path, *, layout: bool, timeout_sec: int = 25) -> str:
    """Extract text from a PDF by invoking the external ``pdftotext`` tool.

    Args:
        path: PDF file to read.
        layout: When true, pass ``-layout`` so the tool keeps the physical
            page layout instead of reading-order text.
        timeout_sec: Maximum time the subprocess may run, in seconds.

    Returns:
        The tool's standard output with surrounding whitespace stripped, or
        an empty string when the executable is not installed or the call
        fails for any reason (including a timeout). The exit status is not
        checked, so whatever was written to stdout is returned as-is.
    """
    exe = _which_pdftotext()
    if not exe:
        return ""

    cmd = [exe]
    if layout:
        cmd.append("-layout")
    # Pin the output encoding: stdout is decoded as UTF-8 below, but some
    # pdftotext builds default to Latin1, which would silently drop
    # non-ASCII text (e.g. Cyrillic) under errors="ignore".
    cmd += ["-enc", "UTF-8", "-nopgbrk", str(path), "-"]

    try:
        proc = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout_sec,
            check=False,
            text=True,
            encoding="utf-8",
            errors="ignore",
        )
    except Exception:
        # Deliberate best-effort: this is one of several extractors, so any
        # failure here just means "no candidate text from pdftotext".
        return ""
    return (proc.stdout or "").strip()
|
||
|
||
|
||
def _extract_pages_pypdf(path: Path, max_pages: int = 60) -> List[dict]:
    """Extract per-page text with the optional pypdf/PyPDF2 reader.

    Args:
        path: PDF file to read.
        max_pages: Stop after this many pages; a falsy value (``0``)
            disables the limit.

    Returns:
        A list of ``{"page": <1-based number>, "text": <str>}`` dicts, one
        per processed page. A page whose extraction raises contributes an
        empty ``text``. The list is empty when no reader library is
        available or the file cannot be opened.
    """
    if PdfReader is None:
        return []

    try:
        document = PdfReader(str(path), strict=False)
    except Exception:
        return []

    collected: List[dict] = []
    for number, pdf_page in enumerate(getattr(document, "pages", []), start=1):
        if max_pages and number > max_pages:
            break
        try:
            content = pdf_page.extract_text() or ""
        except Exception:
            # A single unreadable page must not discard the other pages.
            content = ""
        collected.append({"page": number, "text": content})
    return collected
|
||
|
||
|
||
def _extract_pdfminer(path: Path) -> str:
    """Extract the whole document's text with the optional pdfminer library.

    Returns:
        The extracted text with surrounding whitespace stripped, or an empty
        string when pdfminer is not installed or extraction raises.
    """
    if pdfminer_extract_text is None:
        return ""
    try:
        raw = pdfminer_extract_text(str(path))
        return raw.strip() if raw else ""
    except Exception:
        return ""
|
||
|
||
|
||
def _quality_score(text: str) -> Tuple[float, List[str]]:
    """Heuristically rate how readable an extracted text is.

    The score is a sum of independent checks (maximum 7.0):

    * share of alphabetic characters (up to 2.0),
    * share of space characters within 10-28 % (1.0),
    * average word length within 3.5-9.0 characters (1.0),
    * at most 6 % of non-blank lines longer than 200 characters (1.0),
    * at most 6 "glued" boundaries such as ``fooBar``, ``abc1``, ``1abc`` (1.0),
    * number of known section headings found (0.5 for one, 1.0 for two+).

    Args:
        text: Extracted text to evaluate.

    Returns:
        ``(score, flags)`` where ``flags`` names every failed check
        ("low_alpha", "odd_spacing", "odd_word_len", "long_lines",
        "glued_text") plus "short_text" / "scan_like" for very small or
        almost letter-free texts. Empty input yields ``(0.0, ["empty"])``.
    """
    if not text:
        return 0.0, ["empty"]

    flags: List[str] = []

    total = len(text)  # > 0 here, so the ratios below cannot divide by zero
    alpha_ratio = sum(ch.isalpha() for ch in text) / total
    space_ratio = text.count(" ") / total

    # "Ё"/"ё" lie outside the contiguous А-я range and must be listed
    # explicitly, otherwise Russian words containing them are split in two.
    words = re.findall(r"[A-Za-zА-Яа-яЁё0-9]+", text)
    avg_word_len = (sum(map(len, words)) / len(words)) if words else 0.0

    lines = [ln for ln in text.splitlines() if ln.strip()]
    long_line_ratio = (sum(len(ln) > 200 for ln in lines) / len(lines)) if lines else 0.0

    # lower->Upper, letter->digit and digit->letter boundaries with no space
    # between them are typical of extractions that lost their whitespace.
    glued_hits = len(
        re.findall(
            r"[a-zа-яё][A-ZА-ЯЁ]|[A-Za-zА-Яа-яЁё][0-9]|[0-9][A-Za-zА-Яа-яЁё]",
            text,
        )
    )

    # Lower-case once instead of once per hint. Overlapping hints (e.g.
    # "experience" inside "work experience") are each counted.
    lowered = text.lower()
    section_hits = sum(1 for hint in _SECTION_HINTS if hint in lowered)

    score = 0.0

    if alpha_ratio >= 0.45:
        score += 2.0
    elif alpha_ratio >= 0.30:
        score += 1.0
    else:
        flags.append("low_alpha")

    if 0.10 <= space_ratio <= 0.28:
        score += 1.0
    else:
        flags.append("odd_spacing")

    if 3.5 <= avg_word_len <= 9.0:
        score += 1.0
    else:
        flags.append("odd_word_len")

    if long_line_ratio <= 0.06:
        score += 1.0
    else:
        flags.append("long_lines")

    if glued_hits <= 6:
        score += 1.0
    else:
        flags.append("glued_text")

    if section_hits >= 2:
        score += 1.0
    elif section_hits == 1:
        score += 0.5

    # Informational flags only: they do not change the score.
    if total < 200:
        flags.append("short_text")
    if alpha_ratio < 0.08 or total < 120:
        flags.append("scan_like")

    return score, flags
|
||
|
||
|
||
def deglue_text(text: str) -> str:
    """Insert spaces at boundaries where extraction glued tokens together.

    A single space is inserted, in this order, between:

    1. a lower-case letter followed by an upper-case letter (``fooBar``),
    2. a letter followed by a digit (``Python3``),
    3. a digit followed by a letter (``2020Present``),
    4. ``:`` or ``;`` followed by a letter (``Skills:Python``).

    Latin and Cyrillic letters are recognised. This is a blunt heuristic:
    mixed tokens such as ``B2B`` are split as well.

    Args:
        text: Text to post-process.

    Returns:
        The text with spaces inserted; falsy input is returned unchanged.
    """
    if not text:
        return text

    # "Ё"/"ё" sit outside the contiguous А-я code-point range, so they have
    # to be listed explicitly or boundaries next to them are never split.
    lower = "a-zа-яё"
    upper = "A-ZА-ЯЁ"
    letter = "A-Za-zА-Яа-яЁё"

    rules = (
        rf"([{lower}])([{upper}])",
        rf"([{letter}])([0-9])",
        rf"([0-9])([{letter}])",
        rf"([:;])([{letter}])",
    )
    result = text
    for pattern in rules:
        result = re.sub(pattern, r"\1 \2", result)
    return result
|
||
|
||
|
||
def extract_pdf_best(path: Path, timeout_sec: int = 25) -> PdfExtractResult:
    """Run every available extractor on a PDF and keep the best-looking text.

    Candidates are produced, in this order, by ``pdftotext -layout``, plain
    ``pdftotext``, pypdf/PyPDF2 (pages joined by blank lines) and pdfminer.
    Each non-empty candidate is rated with ``_quality_score``; the highest
    score wins and ties go to the earlier extractor. The winning text is
    passed through ``deglue_text`` before being returned; score and flags
    describe the text as it was before that step.

    Args:
        path: PDF file to read.
        timeout_sec: Timeout, in seconds, for each ``pdftotext`` invocation.

    Returns:
        A ``PdfExtractResult``. When no extractor yields any text the result
        has empty ``text`` and ``pages``, method ``"none"``, score ``0.0``
        and flags ``["empty"]``.
    """
    candidates: List[Tuple[str, str]] = []

    txt_layout = _run_pdftotext(path, layout=True, timeout_sec=timeout_sec)
    if txt_layout:
        candidates.append(("pdftotext_layout", txt_layout))

    txt_plain = _run_pdftotext(path, layout=False, timeout_sec=timeout_sec)
    if txt_plain:
        candidates.append(("pdftotext_plain", txt_plain))

    # Parse the pages once and reuse them both as a candidate text and as the
    # per-page payload of the result. The helper returns [] by itself when no
    # reader library is installed.
    pages = _extract_pages_pypdf(path)
    txt_pypdf = "\n\n".join(p.get("text", "") for p in pages if p.get("text"))
    if txt_pypdf:
        candidates.append(("pypdf", txt_pypdf))

    txt_pdfminer = _extract_pdfminer(path)
    if txt_pdfminer:
        candidates.append(("pdfminer", txt_pdfminer))

    if not candidates:
        return PdfExtractResult(text="", pages=[], method="none", score=0.0, flags=["empty"])

    best_method = "none"
    best_text = ""
    best_score = -1.0
    best_flags: List[str] = []
    for method, text in candidates:
        score, flags = _quality_score(text)
        # Strict ">" keeps the earliest candidate on ties.
        if score > best_score:
            best_method, best_text, best_score, best_flags = method, text, score, flags

    return PdfExtractResult(
        text=deglue_text(best_text),
        pages=pages,
        method=best_method,
        score=best_score,
        flags=best_flags,
    )
|