Initial commit

This commit is contained in:
2026-03-11 15:27:10 +03:00
commit 8b4b8d54d1
34 changed files with 7407 additions and 0 deletions

41
dedup/simhash.py Normal file
View File

@@ -0,0 +1,41 @@
from __future__ import annotations
import hashlib
import re
from typing import List, Tuple
def sha256_file_bytes_iter(f, chunk_size: int = 1024 * 1024) -> str:
    """Return the hex SHA-256 digest of everything readable from *f*.

    *f* is read in pieces of ``chunk_size`` via ``f.read`` until a read
    compares equal to ``b""``, so the whole content is never held in memory
    at once.
    """
    hasher = hashlib.sha256()
    while True:
        piece = f.read(chunk_size)
        if piece == b"":
            # An empty bytes read marks the end of the stream.
            break
        hasher.update(piece)
    return hasher.hexdigest()
def sha256_file(path) -> str:
    """Return the hex SHA-256 digest of the file at *path*.

    The file is opened in binary mode and hashed by
    ``sha256_file_bytes_iter``; it is closed again before returning.
    """
    with open(path, "rb") as stream:
        digest_hex = sha256_file_bytes_iter(stream)
    return digest_hex
def sha1_str(s: str) -> str:
    """Return the hex SHA-1 digest of *s* encoded as UTF-8.

    Characters that cannot be encoded (such as lone surrogates) are dropped
    instead of raising an error.
    """
    payload = s.encode("utf-8", errors="ignore")
    hasher = hashlib.sha1()
    hasher.update(payload)
    return hasher.hexdigest()
def simhash64(text: str) -> int:
    """Compute a 64-bit SimHash fingerprint of *text*.

    The text is lower-cased and split into tokens of two or more characters
    drawn from ``[a-zA-Z0-9_#+./-]``. Every token occurrence votes on each of
    the 64 bit positions using the first 8 bytes (big-endian) of the token's
    MD5 digest: +1 where the bit is set, -1 where it is clear. Bit ``i`` of
    the result is set when its tally is strictly positive.

    Returns:
        The fingerprint as a non-negative int below ``2**64``; 0 when the
        text contains no tokens.
    """
    tokens = re.findall(r"[a-zA-Z0-9_#+./-]{2,}", text.lower())
    if not tokens:
        return 0
    tally = [0] * 64
    for tok in tokens:
        # MD5 only mixes bits here; flagging it as non-security keeps the
        # constructor usable where insecure digests are otherwise blocked
        # (e.g. FIPS-restricted builds). The digest itself is unchanged.
        digest = hashlib.md5(tok.encode("utf-8"), usedforsecurity=False).digest()
        bits = int.from_bytes(digest[:8], "big", signed=False)
        for i in range(64):
            tally[i] += 1 if (bits >> i) & 1 else -1
    # Ties (tally == 0) leave the bit clear.
    return sum(1 << i for i, votes in enumerate(tally) if votes > 0)
def hamming64(a: int, b: int) -> int:
return (a ^ b).bit_count()
def simhash_bands(x: int) -> List[Tuple[int, int]]:
    """Split the low 64 bits of *x* into four 16-bit bands.

    Returns a list of ``(band_value, band_index)`` pairs in index order,
    where band 0 holds the least-significant 16 bits.
    """
    bands: List[Tuple[int, int]] = []
    for band in range(4):
        shift = band * 16
        bands.append(((x >> shift) & 0xFFFF, band))
    return bands