Initial commit
This commit is contained in:
39
extract/clean.py
Normal file
39
extract/clean.py
Normal file
@@ -0,0 +1,39 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from collections import Counter
|
||||
import unicodedata
|
||||
|
||||
# Whole-line pagination markers such as "Page 3 of 10" or "стр 2/5"
# (English "page", Russian "стр"/"страница"), matched case-insensitively.
RE_PAGE = re.compile(
    r"^\s*(page|стр(аница)?)\s*\d+\s*(/|\s+of\s+)\s*\d+\s*$",
    re.IGNORECASE,
)

# Lines made up exclusively of non-word characters and/or underscores.
RE_ONLY_PUNCT = re.compile(r"^[\W_]+$", re.UNICODE)

# Runs of spaces and tabs (collapsed to one space by the cleaner).
RE_MULTI_SPACE = re.compile(r"[ \t]+")

# Three or more consecutive newlines.
RE_MULTI_NL = re.compile(r"\n{3,}")

# Zero-width / invisible characters that are deleted outright.
_INVISIBLE_CHARS = [
    "\u00ad",  # soft hyphen
    "\u200b",  # zero width space
    "\u200c",  # zero width non-joiner
    "\u200d",  # zero width joiner
    "\ufeff",  # byte order mark / zero width no-break space
]

# Bidirectional embedding/override controls (U+202A-U+202E), word joiner
# (U+2060), bidi isolates (U+2066-U+2069) and the U+FFFE/U+FFFF noncharacters.
_BIDI_CTRL_RE = re.compile(r"[\u202a-\u202e\u2060\u2066-\u2069\ufffe\uffff]")
|
||||
|
||||
def normalize_text(raw: str) -> str:
    """Clean extracted text and return it as newline-separated lines.

    Steps, in order:

    1. Normalise ``\\r\\n`` and ``\\r`` line endings to ``\\n``.
    2. Delete the characters in ``_INVISIBLE_CHARS`` and everything matched
       by ``_BIDI_CTRL_RE``.
    3. Turn tabs and Unicode space separators (category ``Zs``) into a plain
       space, turn line/paragraph separators (``Zl``/``Zp``) into a newline,
       and drop every remaining control/format/unassigned character
       (category ``C*``).
    4. Collapse space runs, strip each line, and drop lines that are empty,
       match ``RE_PAGE``, or match ``RE_ONLY_PUNCT``.
    5. Drop every occurrence of a line that appears at least 4 times and is
       at most 90 characters long (presumably running headers/footers).

    Args:
        raw: The text to clean.

    Returns:
        The cleaned text; surviving lines are joined with a single ``\\n``.
    """
    text = raw.replace("\r\n", "\n").replace("\r", "\n")
    for ch in _INVISIBLE_CHARS:
        text = text.replace(ch, "")
    text = _BIDI_CTRL_RE.sub("", text)

    # Single pass over the characters.  Whitespace-like characters are
    # replaced rather than deleted: str.isprintable() is False for tabs and
    # for every separator except ASCII space, so filtering on it alone would
    # glue neighbouring words together ("a\tb" -> "ab", "a\u00a0b" -> "ab").
    kept = []
    for ch in text:
        if ch == "\n":
            kept.append(ch)
            continue
        category = unicodedata.category(ch)
        if ch == "\t" or category == "Zs":
            kept.append(" ")
        elif category in ("Zl", "Zp"):
            kept.append("\n")
        elif not category.startswith("C"):
            kept.append(ch)
        # Anything else is a control/format/unassigned character: dropped.
    text = "".join(kept)

    lines = [RE_MULTI_SPACE.sub(" ", ln).strip() for ln in text.split("\n")]
    lines = [
        ln
        for ln in lines
        if ln and not RE_PAGE.match(ln) and not RE_ONLY_PUNCT.match(ln)
    ]

    # Short lines that repeat many times are removed entirely (all copies).
    counts = Counter(lines)
    filtered = [ln for ln in lines if not (counts[ln] >= 4 and len(ln) <= 90)]

    text = "\n".join(filtered)
    # Blank lines were already dropped above, so this is normally a no-op;
    # kept as a safeguard on the final shape of the output.
    text = RE_MULTI_NL.sub("\n\n", text).strip()
    return text
|
||||
|
||||
def to_fts_text(clean: str) -> str:
    """Flatten *clean* onto one line.

    Every run of whitespace (including newlines) becomes a single space and
    leading/trailing whitespace is removed.
    """
    words = clean.split()
    return " ".join(words)
|
||||
Reference in New Issue
Block a user