Initial commit

2026-03-11 15:27:10 +03:00
commit 8b4b8d54d1
34 changed files with 7407 additions and 0 deletions
--- a/extract/pdf_extract.py
+++ b/extract/pdf_extract.py
@@ -0,0 +1,211 @@
+from __future__ import annotations
+
+import re
+import shutil
+import subprocess
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+try:  # optional dependency
+    from pypdf import PdfReader  # type: ignore
+except Exception:  # pragma: no cover
+    try:
+        from PyPDF2 import PdfReader  # type: ignore
+    except Exception:  # pragma: no cover
+        PdfReader = None  # type: ignore
+
+try:  # optional dependency
+    from pdfminer.high_level import extract_text as pdfminer_extract_text  # type: ignore
+except Exception:  # pragma: no cover
+    pdfminer_extract_text = None  # type: ignore
+
+
+@dataclass
+class PdfExtractResult:
+    """Outcome of a PDF text-extraction attempt, as built by ``extract_pdf_best``."""
+
+    # Text of the winning extraction method after ``deglue_text``; "" when nothing was extracted.
+    text: str
+    # Per-page records shaped like {"page": <1-based number>, "text": <str>} from the pypdf helper.
+    pages: List[dict]
+    # Winning extractor: "pdftotext_layout", "pdftotext_plain", "pypdf", "pdfminer", or "none".
+    method: str
+    # Heuristic score that ``_quality_score`` gave the winning text (0.0 for an empty result).
+    score: float
+    # Flags reported by ``_quality_score`` for the winning text, e.g. "empty", "low_alpha", "scan_like".
+    flags: List[str]
+
+
+# Lower-case section-heading keywords (English and Russian). ``_quality_score`` counts how many
+# of them occur as substrings of the lower-cased text and adds a bonus when at least one is found.
+_SECTION_HINTS = [
+    "experience", "work experience", "skills", "education", "projects", "summary", "about",
+    "опыт работы", "навыки", "образование", "проекты", "о себе",
+]
+
+
+def _which_pdftotext() -> Optional[str]:
+    """Locate the ``pdftotext`` executable on ``PATH``.
+
+    Tries ``pdftotext`` first and ``pdftotext.exe`` second.
+
+    Returns:
+        The resolved path of the first name found, or ``None`` when neither is found.
+    """
+    for candidate in ("pdftotext", "pdftotext.exe"):
+        found = shutil.which(candidate)
+        if found:
+            return found
+    return None
+
+
+def _run_pdftotext(path: Path, *, layout: bool, timeout_sec: int = 25) -> str:
+    """Run ``pdftotext`` on *path* and return the text it writes to stdout.
+
+    Args:
+        path: PDF file handed to the tool.
+        layout: When true, ``-layout`` is passed to the tool.
+        timeout_sec: Seconds to wait before the run is abandoned.
+
+    Returns:
+        The stripped stdout decoded as UTF-8 (undecodable bytes are dropped), or ``""``
+        when the executable is not found or the call raises, a timeout included.
+        The exit status is not inspected.
+    """
+    binary = _which_pdftotext()
+    if not binary:
+        return ""
+
+    mode_args = ["-layout"] if layout else []
+    # The trailing "-" is the output-file argument; "-nopgbrk" is always passed.
+    argv = [binary, *mode_args, "-nopgbrk", str(path), "-"]
+    try:
+        completed = subprocess.run(
+            argv,
+            capture_output=True,
+            text=True,
+            encoding="utf-8",
+            errors="ignore",
+            timeout=timeout_sec,
+            check=False,
+        )
+    except Exception:
+        return ""
+    return (completed.stdout or "").strip()
+
+
+def _extract_pages_pypdf(path: Path, max_pages: int = 60) -> List[dict]:
+    """Extract per-page text with pypdf (or PyPDF2, whichever was importable).
+
+    Args:
+        path: PDF file to read.
+        max_pages: Upper bound on the number of pages read; a falsy value disables the cap.
+
+    Returns:
+        ``[{"page": <1-based index>, "text": <str>}, ...]``. The list is empty when no
+        reader class is available or the file cannot be opened. A page whose text
+        extraction fails is kept with ``""``. If walking the page list itself fails,
+        the pages gathered so far are returned.
+    """
+    pages: List[dict] = []
+    if PdfReader is None:
+        return pages
+    try:
+        reader = PdfReader(str(path), strict=False)
+        # Accessing/iterating the page list may itself raise on damaged or encrypted
+        # files (the reader may resolve pages lazily), so it stays inside the guard.
+        for i, page in enumerate(getattr(reader, "pages", [])):
+            if max_pages and i >= max_pages:
+                break
+            try:
+                text = page.extract_text() or ""
+            except Exception:
+                text = ""
+            pages.append({"page": i + 1, "text": text})
+    except Exception:
+        # Best-effort extractor: parser errors must not reach the caller.
+        return pages
+    return pages
+
+
+def _extract_pdfminer(path: Path) -> str:
+    """Return the stripped text that pdfminer extracts from *path*.
+
+    Returns:
+        The extracted text, or ``""`` when pdfminer could not be imported, when it
+        returns nothing, or when extraction raises.
+    """
+    extractor = pdfminer_extract_text
+    if extractor is None:
+        return ""
+    try:
+        raw = extractor(str(path)) or ""
+        return raw.strip()
+    except Exception:
+        return ""
+
+
+def _quality_score(text: str) -> Tuple[float, List[str]]:
+    """Rate how much *text* looks like cleanly extracted prose.
+
+    Points are awarded per heuristic (maximum 7.0):
+
+    * share of alphabetic characters: +2 at >= 0.45, +1 at >= 0.30;
+    * share of space characters within 0.10..0.28: +1;
+    * mean word length within 3.5..9.0 characters: +1;
+    * at most 6% of the non-blank lines longer than 200 characters: +1;
+    * at most 6 "glued" boundaries (lower->Upper, letter<->digit): +1;
+    * keywords from ``_SECTION_HINTS`` present: +1 for two or more, +0.5 for one.
+
+    Words and glued boundaries are detected for Latin and Russian letters,
+    including ``ё``/``Ё``.
+
+    Returns:
+        ``(score, flags)``. A failed heuristic adds its flag ("low_alpha", "odd_spacing",
+        "odd_word_len", "long_lines", "glued_text"). "short_text" is added below 200
+        characters and "scan_like" when the alphabetic share is under 0.08 or the text
+        is shorter than 120 characters. Empty text yields ``(0.0, ["empty"])``.
+    """
+    if not text:
+        return 0.0, ["empty"]
+
+    flags: List[str] = []
+
+    # The range А-я does not contain ё/Ё (U+0451/U+0401), so they are listed explicitly;
+    # otherwise words such as "ещё" are cut short and their boundaries go unnoticed.
+    letter = "A-Za-zА-Яа-яЁё"
+
+    total = len(text)  # non-zero here, so it is safe as a divisor
+    alpha_ratio = sum(ch.isalpha() for ch in text) / total
+    space_ratio = text.count(" ") / total
+
+    words = re.findall(rf"[{letter}0-9]+", text)
+    avg_word_len = sum(map(len, words)) / len(words) if words else 0.0
+
+    lines = [ln for ln in text.splitlines() if ln.strip()]
+    long_line_ratio = sum(len(ln) > 200 for ln in lines) / len(lines) if lines else 0.0
+
+    glued_hits = len(re.findall(rf"[a-zа-яё][A-ZА-ЯЁ]|[{letter}][0-9]|[0-9][{letter}]", text))
+
+    # Lower-case the text once instead of once per hint.
+    lowered = text.lower()
+    section_hits = sum(1 for hint in _SECTION_HINTS if hint in lowered)
+
+    score = 0.0
+    if alpha_ratio >= 0.45:
+        score += 2.0
+    elif alpha_ratio >= 0.30:
+        score += 1.0
+    else:
+        flags.append("low_alpha")
+
+    if 0.10 <= space_ratio <= 0.28:
+        score += 1.0
+    else:
+        flags.append("odd_spacing")
+
+    if 3.5 <= avg_word_len <= 9.0:
+        score += 1.0
+    else:
+        flags.append("odd_word_len")
+
+    if long_line_ratio <= 0.06:
+        score += 1.0
+    else:
+        flags.append("long_lines")
+
+    if glued_hits <= 6:
+        score += 1.0
+    else:
+        flags.append("glued_text")
+
+    if section_hits >= 2:
+        score += 1.0
+    elif section_hits == 1:
+        score += 0.5
+
+    if total < 200:
+        flags.append("short_text")
+
+    if alpha_ratio < 0.08 or total < 120:
+        flags.append("scan_like")
+
+    return score, flags
+
+
+def deglue_text(text: str) -> str:
+    """Insert spaces at boundaries that text extraction tends to glue together.
+
+    A space is added between a lower-case letter and a following upper-case letter,
+    between a letter and a digit (in either order), and after ``:`` or ``;`` when a
+    letter follows directly. Latin and Russian letters, including ``ё``/``Ё``, are
+    recognised.
+
+    The rules are purely lexical, so genuine tokens of the same shape are split as
+    well (``"JavaScript"`` -> ``"Java Script"``, ``"B2B"`` -> ``"B 2 B"``).
+
+    Returns:
+        The spaced-out text; falsy input is returned unchanged.
+    """
+    if not text:
+        return text
+    # The range А-я skips ё/Ё (U+0451/U+0401), so both are added explicitly.
+    letter = "A-Za-zА-Яа-яЁё"
+    rules = (
+        r"([a-zа-яё])([A-ZА-ЯЁ])",
+        rf"([{letter}])([0-9])",
+        rf"([0-9])([{letter}])",
+        rf"([:;])([{letter}])",
+    )
+    # Order matters: each rule runs over the output of the previous one.
+    for pattern in rules:
+        text = re.sub(pattern, r"\1 \2", text)
+    return text
+
+
+def extract_pdf_best(path: Path, timeout_sec: int = 25) -> PdfExtractResult:
+    """Extract text from a PDF with every available backend and keep the best-scoring result.
+
+    Backends tried: ``pdftotext -layout``, plain ``pdftotext``, pypdf and pdfminer. Each
+    non-empty output is rated by ``_quality_score``; the highest score wins and ties go
+    to the backend listed first. The winning text is passed through ``deglue_text``.
+
+    Args:
+        path: PDF file to read.
+        timeout_sec: Timeout handed to each ``pdftotext`` run; it is not applied to the
+            pypdf and pdfminer backends.
+
+    Returns:
+        A ``PdfExtractResult``. ``score`` and ``flags`` describe the winning text before
+        de-gluing, and ``pages`` always comes from pypdf, whichever backend won. When no
+        backend yields text the result is ``text=""``, ``pages=[]``, ``method="none"``,
+        ``score=0.0``, ``flags=["empty"]``.
+    """
+    # Parse with pypdf once and reuse the pages for both the "pypdf" candidate and the
+    # returned page list (the helper returns [] when pypdf is unavailable or fails).
+    pages = _extract_pages_pypdf(path)
+
+    outputs = [
+        ("pdftotext_layout", _run_pdftotext(path, layout=True, timeout_sec=timeout_sec)),
+        ("pdftotext_plain", _run_pdftotext(path, layout=False, timeout_sec=timeout_sec)),
+        ("pypdf", "\n\n".join(p.get("text", "") for p in pages if p.get("text"))),
+        ("pdfminer", _extract_pdfminer(path)),
+    ]
+    candidates = [(method, text) for method, text in outputs if text]
+    if not candidates:
+        return PdfExtractResult(text="", pages=[], method="none", score=0.0, flags=["empty"])
+
+    best_method, best_text = candidates[0]
+    best_score, best_flags = _quality_score(best_text)
+    for method, text in candidates[1:]:
+        score, flags = _quality_score(text)
+        # Strict ">" keeps the earliest candidate on equal scores.
+        if score > best_score:
+            best_method, best_text, best_score, best_flags = method, text, score, flags
+
+    return PdfExtractResult(
+        text=deglue_text(best_text),
+        pages=pages,
+        method=best_method,
+        score=best_score,
+        flags=best_flags,
+    )