Initial commit
This commit is contained in:
211
extract/pdf_extract.py
Normal file
211
extract/pdf_extract.py
Normal file
@@ -0,0 +1,211 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
try: # optional dependency
|
||||
from pypdf import PdfReader # type: ignore
|
||||
except Exception: # pragma: no cover
|
||||
try:
|
||||
from PyPDF2 import PdfReader # type: ignore
|
||||
except Exception: # pragma: no cover
|
||||
PdfReader = None # type: ignore
|
||||
|
||||
try: # optional dependency
|
||||
from pdfminer.high_level import extract_text as pdfminer_extract_text # type: ignore
|
||||
except Exception: # pragma: no cover
|
||||
pdfminer_extract_text = None # type: ignore
|
||||
|
||||
|
||||
@dataclass
class PdfExtractResult:
    """Result of extracting text from a single PDF file."""

    # Text of the chosen extraction candidate.
    text: str
    # Per-page records; _extract_pages_pypdf builds them as
    # {"page": <1-based page number>, "text": <page text>}.
    pages: List[dict]
    # Label of the extractor that produced ``text`` ("none" when nothing worked).
    method: str
    # Heuristic quality score of the chosen candidate (higher is better).
    score: float
    # Quality warnings attached to the chosen candidate, e.g. "empty".
    flags: List[str]
|
||||
|
||||
|
||||
# Lower-case section headings (English and Russian) whose presence in the
# extracted text is treated as a sign of a well-extracted document.
_SECTION_HINTS = [
    # English
    "experience",
    "work experience",
    "skills",
    "education",
    "projects",
    "summary",
    "about",
    # Russian
    "опыт работы",  # work experience
    "навыки",  # skills
    "образование",  # education
    "проекты",  # projects
    "о себе",  # about me
]
|
||||
|
||||
|
||||
def _which_pdftotext() -> Optional[str]:
    """Return the path of the ``pdftotext`` executable found on PATH.

    Tries ``pdftotext`` first and ``pdftotext.exe`` second; returns ``None``
    when neither name resolves.
    """
    for candidate in ("pdftotext", "pdftotext.exe"):
        located = shutil.which(candidate)
        if located:
            return located
    return None
|
||||
|
||||
|
||||
def _run_pdftotext(path: Path, *, layout: bool, timeout_sec: int = 25) -> str:
    """Run the external ``pdftotext`` tool on *path* and return its output.

    Args:
        path: PDF file to convert.
        layout: When true, pass ``-layout`` to ``pdftotext``.
        timeout_sec: Maximum run time of the subprocess, in seconds.

    Returns:
        The stripped standard output of the tool, or ``""`` when the
        executable is not found or running it raises any exception
        (including a timeout). The exit code is not inspected.
    """
    binary = _which_pdftotext()
    if not binary:
        return ""

    layout_args = ["-layout"] if layout else []
    # "-nopgbrk" suppresses page-break characters; the trailing "-" sends the
    # converted text to stdout instead of a file.
    argv = [binary, *layout_args, "-nopgbrk", str(path), "-"]

    try:
        completed = subprocess.run(
            argv,
            capture_output=True,
            timeout=timeout_sec,
            check=False,
            text=True,
            encoding="utf-8",
            errors="ignore",
        )
        captured = completed.stdout or ""
        return captured.strip()
    except Exception:
        # Best effort: any failure simply means "no text from this extractor".
        return ""
|
||||
|
||||
|
||||
def _extract_pages_pypdf(path: Path, max_pages: int = 60) -> List[dict]:
    """Extract per-page text with pypdf / PyPDF2, if one of them is importable.

    Args:
        path: PDF file to read.
        max_pages: Upper bound on the number of pages read; a falsy value
            (``0`` or ``None``) disables the limit.

    Returns:
        A list of ``{"page": <1-based number>, "text": <str>}`` dicts. The list
        is empty when no reader is available or the file cannot be opened; a
        page whose extraction fails contributes an empty ``text``.
    """
    if PdfReader is None:
        return []

    try:
        document = PdfReader(str(path), strict=False)
    except Exception:
        return []

    collected: List[dict] = []
    for number, page in enumerate(getattr(document, "pages", []), start=1):
        if max_pages and number > max_pages:
            break
        try:
            content = page.extract_text() or ""
        except Exception:
            # Keep the page slot so page numbers stay aligned.
            content = ""
        collected.append({"page": number, "text": content})
    return collected
|
||||
|
||||
|
||||
def _extract_pdfminer(path: Path) -> str:
    """Extract the whole document text with pdfminer, if it is importable.

    Returns the stripped text, or ``""`` when pdfminer is unavailable, yields
    nothing, or raises any exception.
    """
    extractor = pdfminer_extract_text
    if extractor is None:
        return ""

    try:
        raw = extractor(str(path))
        return raw.strip() if raw else ""
    except Exception:
        return ""
|
||||
|
||||
|
||||
# Word tokens made of Latin/Cyrillic letters and digits. "Ё"/"ё" are listed
# explicitly because they lie outside the contiguous "А-я" code point range.
_SCORE_WORD_RE = re.compile(r"[A-Za-zА-Яа-яЁё0-9]+")

# Adjacent character pairs that suggest missing whitespace:
# lower→upper, letter→digit, digit→letter.
_SCORE_GLUE_RE = re.compile(
    r"[a-zа-яё][A-ZА-ЯЁ]|[A-Za-zА-Яа-яЁё][0-9]|[0-9][A-Za-zА-Яа-яЁё]"
)


def _quality_score(text: str) -> Tuple[float, List[str]]:
    """Rate how plausible *text* looks as cleanly extracted document text.

    The score is a sum of independent heuristics (maximum 7.0):

    * share of alphabetic characters (up to 2 points),
    * share of space characters (1 point),
    * average word length (1 point),
    * share of very long lines, over 200 characters (1 point),
    * number of "glued" character pairs (1 point),
    * number of known section headings found (up to 1 point).

    Args:
        text: Candidate text produced by one extractor.

    Returns:
        ``(score, flags)`` where ``flags`` names every heuristic that failed,
        plus ``"short_text"`` / ``"scan_like"`` for very small or nearly
        letter-free inputs. Empty input yields ``(0.0, ["empty"])``.
    """
    if not text:
        return 0.0, ["empty"]

    flags: List[str] = []
    total = len(text)  # non-zero: empty input returned above

    alpha_ratio = sum(ch.isalpha() for ch in text) / total
    space_ratio = text.count(" ") / total

    words = _SCORE_WORD_RE.findall(text)
    avg_word_len = (sum(map(len, words)) / len(words)) if words else 0.0

    lines = [ln for ln in text.splitlines() if ln.strip()]
    long_line_count = sum(1 for ln in lines if len(ln) > 200)
    long_line_ratio = (long_line_count / len(lines)) if lines else 0.0

    glued_hits = len(_SCORE_GLUE_RE.findall(text))

    # Lower-case once instead of once per hint.
    lowered = text.lower()
    section_hits = sum(1 for hint in _SECTION_HINTS if hint in lowered)

    score = 0.0

    if alpha_ratio >= 0.45:
        score += 2.0
    elif alpha_ratio >= 0.30:
        score += 1.0
    else:
        flags.append("low_alpha")

    if 0.10 <= space_ratio <= 0.28:
        score += 1.0
    else:
        flags.append("odd_spacing")

    if 3.5 <= avg_word_len <= 9.0:
        score += 1.0
    else:
        flags.append("odd_word_len")

    if long_line_ratio <= 0.06:
        score += 1.0
    else:
        flags.append("long_lines")

    if glued_hits <= 6:
        score += 1.0
    else:
        flags.append("glued_text")

    if section_hits >= 2:
        score += 1.0
    elif section_hits == 1:
        score += 0.5

    # Informational flags; they do not change the score.
    if total < 200:
        flags.append("short_text")
    if alpha_ratio < 0.08 or total < 120:
        flags.append("scan_like")

    return score, flags
|
||||
|
||||
|
||||
# Patterns whose two captured groups should be separated by a single space.
# "Ё"/"ё" are listed explicitly because they lie outside the "А-я" range.
_DEGLUE_PATTERNS = (
    re.compile(r"([a-zа-яё])([A-ZА-ЯЁ])"),  # lower-case letter → upper-case letter
    re.compile(r"([A-Za-zА-Яа-яЁё])([0-9])"),  # letter → digit
    re.compile(r"([0-9])([A-Za-zА-Яа-яЁё])"),  # digit → letter
    re.compile(r"([:;])([A-Za-zА-Яа-яЁё])"),  # colon/semicolon → letter
)


def deglue_text(text: str) -> str:
    """Insert spaces where extraction glued adjacent tokens together.

    A space is inserted between a lower-case and an upper-case letter,
    between a letter and a digit (either order), and after ``:`` or ``;``
    when a letter follows immediately. Latin and Cyrillic letters are
    handled, including ``Ё``/``ё``.

    Args:
        text: Extracted text; a falsy value is returned unchanged.

    Returns:
        The text with the separating spaces inserted.
    """
    if not text:
        return text

    # Order matters: each rule runs on the output of the previous one.
    for pattern in _DEGLUE_PATTERNS:
        text = pattern.sub(r"\1 \2", text)
    return text
|
||||
|
||||
|
||||
def extract_pdf_best(path: Path, timeout_sec: int = 25) -> PdfExtractResult:
    """Extract text from a PDF with every available backend and keep the best.

    Candidates are produced, in order, by ``pdftotext -layout``, plain
    ``pdftotext``, pypdf/PyPDF2 and pdfminer. Each non-empty candidate is
    rated with ``_quality_score``; the highest score wins and the earliest
    candidate wins ties. The winning text is passed through ``deglue_text``.

    Args:
        path: PDF file to process.
        timeout_sec: Timeout, in seconds, applied to each ``pdftotext`` run.
            The pypdf and pdfminer backends are not time-limited.

    Returns:
        A ``PdfExtractResult``. ``pages`` always comes from the pypdf backend,
        whichever method won. ``score`` and ``flags`` describe the winning
        text *before* de-gluing. When no backend yields text, the result has
        ``method="none"``, ``score=0.0``, ``flags=["empty"]`` and no pages.
    """
    candidates: List[Tuple[str, str]] = [
        ("pdftotext_layout", _run_pdftotext(path, layout=True, timeout_sec=timeout_sec)),
        ("pdftotext_plain", _run_pdftotext(path, layout=False, timeout_sec=timeout_sec)),
    ]

    # Parse with pypdf once; the pages serve both as a text candidate and as
    # the per-page payload of the result. An empty list comes back when no
    # reader is installed or the file cannot be opened.
    pages = _extract_pages_pypdf(path)
    candidates.append(
        ("pypdf", "\n\n".join(p.get("text", "") for p in pages if p.get("text")))
    )

    candidates.append(("pdfminer", _extract_pdfminer(path)))

    # Backends that produced nothing do not take part in the ranking.
    candidates = [(method, text) for method, text in candidates if text]
    if not candidates:
        return PdfExtractResult(text="", pages=[], method="none", score=0.0, flags=["empty"])

    best_method = "none"
    best_text = ""
    best_score = -1.0
    best_flags: List[str] = []
    for method, text in candidates:
        score, flags = _quality_score(text)
        # Strict comparison keeps the earliest candidate on ties.
        if score > best_score:
            best_method, best_text, best_score, best_flags = method, text, score, flags

    return PdfExtractResult(
        text=deglue_text(best_text),
        pages=pages,
        method=best_method,
        score=best_score,
        flags=best_flags,
    )
|
||||
Reference in New Issue
Block a user