"""Digest helpers (SHA-256 / SHA-1) and a 64-bit SimHash with band splitting."""
from __future__ import annotations

import hashlib
import re
from typing import List, Tuple

# Feature tokens for SimHash: runs of two or more of these characters.
# Compiled once at import time instead of being looked up on every call.
_TOKEN_RE = re.compile(r"[a-zA-Z0-9_#+./-]{2,}")

# Keeps only the low 64 bits of an int.
_MASK64 = (1 << 64) - 1


def sha256_file_bytes_iter(f, chunk_size: int = 1024 * 1024) -> str:
    """Return the SHA-256 hex digest of everything readable from *f*.

    Args:
        f: Object with a ``read(size)`` method returning ``bytes``; reading
            starts at its current position and stops at the first ``b""``.
        chunk_size: Number of bytes requested per ``read`` call.

    Returns:
        The digest as a lowercase hexadecimal string.
    """
    h = hashlib.sha256()
    # Feed the hash chunk by chunk so the whole stream is never held in memory.
    for chunk in iter(lambda: f.read(chunk_size), b""):
        h.update(chunk)
    return h.hexdigest()


def sha256_file(path) -> str:
    """Return the SHA-256 hex digest of the file at *path*.

    The file is opened in binary mode and closed before returning.
    """
    with open(path, "rb") as f:
        return sha256_file_bytes_iter(f)


def sha1_str(s: str) -> str:
    """Return the SHA-1 hex digest of *s* encoded as UTF-8.

    Characters that cannot be encoded as UTF-8 are silently dropped
    (``errors="ignore"``), so strings differing only in such characters
    produce the same digest.
    """
    return hashlib.sha1(s.encode("utf-8", errors="ignore")).hexdigest()


def simhash64(text: str) -> int:
    """Compute a 64-bit SimHash fingerprint of *text*.

    The text is lowercased and split into tokens matching ``_TOKEN_RE``.
    Each token occurrence votes +1 or -1 on every bit position according to
    the first 8 bytes (big-endian) of the token's MD5 digest. A result bit is
    set only where the votes sum to a strictly positive value, so ties
    resolve to 0.

    Returns:
        An int in ``[0, 2**64)``; ``0`` when *text* contains no tokens.
    """
    tokens = _TOKEN_RE.findall(text.lower())
    if not tokens:
        return 0

    votes = [0] * 64
    for tok in tokens:
        digest = hashlib.md5(tok.encode("utf-8")).digest()
        bits = int.from_bytes(digest[:8], "big", signed=False)
        for i in range(64):
            votes[i] += 1 if (bits >> i) & 1 else -1

    fingerprint = 0
    for i, vote in enumerate(votes):
        if vote > 0:
            fingerprint |= 1 << i
    return fingerprint


def hamming64(a: int, b: int) -> int:
    """Return the number of differing bits in the low 64 bits of *a* and *b*.

    The XOR is masked to 64 bits, so a fingerprint gives the same distance
    whether it is held as an unsigned value or as its signed 64-bit
    two's-complement equivalent.
    """
    diff = (a ^ b) & _MASK64
    try:
        return diff.bit_count()
    except AttributeError:
        # int.bit_count() only exists on Python 3.10+.
        return bin(diff).count("1")


def simhash_bands(x: int) -> List[Tuple[int, int]]:
    """Split the low 64 bits of *x* into 4 bands of 16 bits.

    Returns:
        A list of ``(band_value, band_index)`` pairs, ordered from the least
        significant band (index 0) to the most significant (index 3).
    """
    return [((x >> (band * 16)) & 0xFFFF, band) for band in range(4)]