Initial commit

This commit is contained in:
2026-03-11 15:27:10 +03:00
commit 8b4b8d54d1
34 changed files with 7407 additions and 0 deletions

211
extract/pdf_extract.py Normal file
View File

@@ -0,0 +1,211 @@
from __future__ import annotations
import re
import shutil
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple
try: # optional dependency
from pypdf import PdfReader # type: ignore
except Exception: # pragma: no cover
try:
from PyPDF2 import PdfReader # type: ignore
except Exception: # pragma: no cover
PdfReader = None # type: ignore
try: # optional dependency
from pdfminer.high_level import extract_text as pdfminer_extract_text # type: ignore
except Exception: # pragma: no cover
pdfminer_extract_text = None # type: ignore
@dataclass
class PdfExtractResult:
    """Outcome of a PDF text extraction attempt.

    Bundles the chosen text together with the name of the extraction
    method that produced it and the heuristic quality information
    computed for that text.
    """

    # Text selected as the best extraction candidate.
    text: str
    # Per-page records; this module fills them as {"page": <1-based int>, "text": <str>}.
    pages: List[dict]
    # Identifier of the extractor that produced ``text`` ("none" when nothing worked).
    method: str
    # Heuristic quality score of ``text`` (higher is better).
    score: float
    # Quality warnings attached to ``text`` (e.g. "empty", "short_text").
    flags: List[str]
# Lower-case section headings (English and Russian) that the quality
# heuristic searches for as substrings of the lower-cased extracted text.
_SECTION_HINTS = [
    # English headings
    "experience",
    "work experience",
    "skills",
    "education",
    "projects",
    "summary",
    "about",
    # Russian headings
    "опыт работы",
    "навыки",
    "образование",
    "проекты",
    "о себе",
]
def _which_pdftotext() -> Optional[str]:
    """Locate the ``pdftotext`` executable on ``PATH``.

    Returns:
        The resolved path of ``pdftotext`` (or ``pdftotext.exe`` as a
        fallback name), or ``None`` when neither can be found.
    """
    for candidate in ("pdftotext", "pdftotext.exe"):
        resolved = shutil.which(candidate)
        if resolved:
            return resolved
    return None
def _run_pdftotext(path: Path, *, layout: bool, timeout_sec: int = 25) -> str:
    """Extract text from a PDF with the external ``pdftotext`` tool.

    Args:
        path: PDF file to read.
        layout: When true, pass ``-layout`` so the tool keeps the
            physical layout of the page.
        timeout_sec: Maximum number of seconds the subprocess may run.

    Returns:
        The stripped text written to stdout, or ``""`` when the tool is
        not installed, cannot be started, or times out. This is a
        best-effort helper and never raises for those failures.
    """
    exe = _which_pdftotext()
    if not exe:
        return ""

    # Force UTF-8 output: the stdout decoder below assumes UTF-8, but some
    # pdftotext builds (Xpdf) default to Latin1, which drops Cyrillic text.
    cmd = [exe, "-enc", "UTF-8"]
    if layout:
        cmd.append("-layout")
    # "-nopgbrk" suppresses form-feed page separators; "-" sends text to stdout.
    cmd += ["-nopgbrk", str(path), "-"]

    try:
        proc = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout_sec,
            check=False,
            text=True,
            encoding="utf-8",
            errors="ignore",
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        # OSError: executable vanished / not runnable; ValueError: invalid
        # argument such as an embedded NUL; SubprocessError: timeout.
        return ""
    return (proc.stdout or "").strip()
def _extract_pages_pypdf(path: Path, max_pages: int = 60) -> List[dict]:
    """Read per-page text using the optional pypdf/PyPDF2 reader.

    Args:
        path: PDF file to read.
        max_pages: Upper bound on the number of pages processed; a falsy
            value (``0``/``None``) disables the limit.

    Returns:
        A list of ``{"page": <1-based number>, "text": <str>}`` dicts.
        The list is empty when no reader is available or the file cannot
        be opened; a page whose extraction fails contributes ``""``.
    """
    if PdfReader is None:
        return []

    try:
        document = PdfReader(str(path), strict=False)
    except Exception:
        return []

    extracted: List[dict] = []
    for number, page in enumerate(getattr(document, "pages", []), start=1):
        # ``number`` is 1-based, so exceeding the cap means we are done.
        if max_pages and number > max_pages:
            break
        try:
            content = page.extract_text() or ""
        except Exception:
            content = ""
        extracted.append({"page": number, "text": content})
    return extracted
def _extract_pdfminer(path: Path) -> str:
    """Extract the whole document text with the optional pdfminer backend.

    Returns:
        The stripped text, or ``""`` when pdfminer is not importable or
        the extraction raises.
    """
    extractor = pdfminer_extract_text
    if extractor is None:
        return ""
    try:
        raw = extractor(str(path))
        return (raw or "").strip()
    except Exception:
        return ""
def _quality_score(text: str) -> Tuple[float, List[str]]:
    """Heuristically rate how clean an extracted text looks.

    The score is a sum of independent checks (maximum 7.0):

    * share of alphabetic characters: ``>= 0.45`` adds 2, ``>= 0.30`` adds 1;
    * share of space characters within ``[0.10, 0.28]`` adds 1;
    * average word length within ``[3.5, 9.0]`` adds 1;
    * at most 6% of non-blank lines longer than 200 characters adds 1;
    * at most 6 "glued" transitions (lower->upper case, letter<->digit) adds 1;
    * section headings from ``_SECTION_HINTS``: two or more add 1, one adds 0.5.

    Args:
        text: Extracted text to evaluate.

    Returns:
        ``(score, flags)`` where ``flags`` lists every failed check plus
        ``"short_text"`` (fewer than 200 characters) and ``"scan_like"``
        (alphabetic share below 0.08 or fewer than 120 characters).
        Empty input yields ``(0.0, ["empty"])``.
    """
    if not text:
        return 0.0, ["empty"]

    flags: List[str] = []
    total = len(text)
    alpha_ratio = sum(ch.isalpha() for ch in text) / total
    space_ratio = text.count(" ") / total

    # "Ё"/"ё" lie outside the contiguous А-Я / а-я ranges, so they must be
    # listed explicitly or Russian words containing them get split.
    words = re.findall(r"[A-Za-zА-Яа-яЁё0-9]+", text)
    avg_word_len = (sum(len(w) for w in words) / len(words)) if words else 0.0

    lines = [ln for ln in text.splitlines() if ln.strip()]
    long_line_count = sum(1 for ln in lines if len(ln) > 200)
    long_line_ratio = (long_line_count / len(lines)) if lines else 0.0

    glued_hits = len(
        re.findall(
            r"[a-zа-яё][A-ZА-ЯЁ]|[A-Za-zА-Яа-яЁё][0-9]|[0-9][A-Za-zА-Яа-яЁё]",
            text,
        )
    )

    # Lower-case once instead of once per hint.
    lowered = text.lower()
    section_hits = sum(1 for hint in _SECTION_HINTS if hint in lowered)

    score = 0.0

    if alpha_ratio >= 0.45:
        score += 2.0
    elif alpha_ratio >= 0.30:
        score += 1.0
    else:
        flags.append("low_alpha")

    if 0.10 <= space_ratio <= 0.28:
        score += 1.0
    else:
        flags.append("odd_spacing")

    if 3.5 <= avg_word_len <= 9.0:
        score += 1.0
    else:
        flags.append("odd_word_len")

    if long_line_ratio <= 0.06:
        score += 1.0
    else:
        flags.append("long_lines")

    if glued_hits <= 6:
        score += 1.0
    else:
        flags.append("glued_text")

    if section_hits >= 2:
        score += 1.0
    elif section_hits == 1:
        score += 0.5

    # Informational flags; they do not change the score.
    if total < 200:
        flags.append("short_text")
    if alpha_ratio < 0.08 or total < 120:
        flags.append("scan_like")

    return score, flags
def deglue_text(text: str) -> str:
    """Insert spaces where PDF extraction glued tokens together.

    A single space is inserted at each of these boundaries (Latin and
    Cyrillic letters, including "Ё"/"ё"):

    * lower-case letter followed by an upper-case letter;
    * letter followed by a digit, and digit followed by a letter;
    * ``:`` or ``;`` immediately followed by a letter.

    Args:
        text: Text to repair.

    Returns:
        The repaired text; a falsy input is returned unchanged.
    """
    if not text:
        return text

    # "Ё"/"ё" sit outside the contiguous А-Я / а-я ranges and must be
    # listed explicitly, otherwise boundaries next to them are missed.
    lower = "a-zа-яё"
    upper = "A-ZА-ЯЁ"
    letter = "A-Za-zА-Яа-яЁё"

    t = re.sub(rf"([{lower}])([{upper}])", r"\1 \2", text)
    t = re.sub(rf"([{letter}])([0-9])", r"\1 \2", t)
    t = re.sub(rf"([0-9])([{letter}])", r"\1 \2", t)
    t = re.sub(rf"([:;])([{letter}])", r"\1 \2", t)
    return t
def extract_pdf_best(path: Path, timeout_sec: int = 25) -> PdfExtractResult:
    """Extract text with every available backend and keep the best one.

    Candidates are produced, in order, by ``pdftotext -layout``, plain
    ``pdftotext``, pypdf and pdfminer. Each non-empty candidate is rated
    with ``_quality_score`` and the highest score wins; on a tie the
    earlier candidate is kept.

    Args:
        path: PDF file to read.
        timeout_sec: Timeout, in seconds, for each ``pdftotext`` call.

    Returns:
        A ``PdfExtractResult`` whose ``text`` is the winning candidate after
        ``deglue_text``; ``score`` and ``flags`` describe the candidate as
        scored before degluing. ``pages`` holds the pypdf per-page records.
        When no backend yields text the result has empty text and pages,
        method ``"none"``, score ``0.0`` and flags ``["empty"]``.
    """
    candidates: List[Tuple[str, str]] = []

    txt_layout = _run_pdftotext(path, layout=True, timeout_sec=timeout_sec)
    if txt_layout:
        candidates.append(("pdftotext_layout", txt_layout))

    txt_plain = _run_pdftotext(path, layout=False, timeout_sec=timeout_sec)
    if txt_plain:
        candidates.append(("pdftotext_plain", txt_plain))

    # Parse the PDF with pypdf once: the pages serve both as a text
    # candidate and as the per-page payload of the result. The helper
    # already returns [] when no reader is installed.
    pages = _extract_pages_pypdf(path)
    txt_pypdf = "\n\n".join(p.get("text", "") for p in pages if p.get("text"))
    if txt_pypdf:
        candidates.append(("pypdf", txt_pypdf))

    txt_pdfminer = _extract_pdfminer(path)
    if txt_pdfminer:
        candidates.append(("pdfminer", txt_pdfminer))

    if not candidates:
        return PdfExtractResult(text="", pages=[], method="none", score=0.0, flags=["empty"])

    scored = []
    for method, text in candidates:
        score, flags = _quality_score(text)
        scored.append((method, text, score, flags))

    # max() returns the first maximal item, preserving candidate priority on ties.
    best_method, best_text, best_score, best_flags = max(scored, key=lambda item: item[2])

    return PdfExtractResult(
        text=deglue_text(best_text),
        pages=pages,
        method=best_method,
        score=best_score,
        flags=best_flags,
    )