42 lines
1.1 KiB
Python
42 lines
1.1 KiB
Python
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import re
|
|
from typing import List, Tuple
|
|
|
|
def sha256_file_bytes_iter(f, chunk_size: int = 1024 * 1024) -> str:
    """Return the hex SHA-256 digest of the bytes readable from *f*.

    The stream is consumed with repeated ``f.read(chunk_size)`` calls so the
    whole content never has to be held in memory at once.

    Args:
        f: Object with a ``read(size)`` method returning bytes; reading stops
            at the first ``b""`` result.
        chunk_size: Number of bytes requested per read (default 1 MiB).

    Returns:
        The digest as a lowercase hexadecimal string.
    """
    digest = hashlib.sha256()
    while True:
        block = f.read(chunk_size)
        # An empty bytes result marks end of stream.
        if block == b"":
            break
        digest.update(block)
    return digest.hexdigest()
|
|
|
|
def sha256_file(path) -> str:
    """Return the hex SHA-256 digest of the file at *path*.

    The file is opened in binary mode and hashed in chunks via
    ``sha256_file_bytes_iter``; it is closed before returning.
    """
    with open(path, "rb") as stream:
        hex_digest = sha256_file_bytes_iter(stream)
    return hex_digest
|
|
|
|
def sha1_str(s: str) -> str:
    """Return the hex SHA-1 digest of *s* encoded as UTF-8.

    Characters that cannot be encoded (e.g. lone surrogates) are dropped
    before hashing, so such strings hash the same as their encodable part.
    """
    payload = s.encode("utf-8", errors="ignore")
    hasher = hashlib.sha1(payload)
    return hasher.hexdigest()
|
|
|
|
def simhash64(text: str) -> int:
    """Compute a 64-bit SimHash fingerprint of *text*.

    The text is lower-cased and split into tokens of at least two characters
    drawn from ``[a-zA-Z0-9_#+./-]``. Each token occurrence is hashed with
    MD5 and the first 8 digest bytes (big-endian) cast one vote per bit
    position. An output bit is set only when strictly more than half of the
    token occurrences have that bit set; ties leave the bit cleared.

    Args:
        text: Input text to fingerprint.

    Returns:
        An integer in ``[0, 2**64)``; ``0`` when the text yields no tokens.
    """
    tokens = re.findall(r"[a-zA-Z0-9_#+./-]{2,}", text.lower())
    if not tokens:
        return 0

    # ones[i] counts how many token hashes have bit i set.
    ones = [0] * 64
    for tok in tokens:
        # MD5 is only a bit-mixing function here, not a security primitive;
        # saying so keeps this working on FIPS-restricted builds.
        digest = hashlib.md5(tok.encode("utf-8"), usedforsecurity=False).digest()
        value = int.from_bytes(digest[:8], "big", signed=False)
        for bit in range(64):
            ones[bit] += (value >> bit) & 1

    # "+1 per set bit, -1 per clear bit > 0" is the same as a strict majority.
    total = len(tokens)
    fingerprint = 0
    for bit, count in enumerate(ones):
        if 2 * count > total:
            fingerprint |= 1 << bit
    return fingerprint
|
|
|
|
def hamming64(a: int, b: int) -> int:
|
|
return (a ^ b).bit_count()
|
|
|
|
def simhash_bands(x: int) -> List[Tuple[int, int]]:
    """Split *x* into four 16-bit bands, least-significant band first.

    Returns:
        A list of ``(band_value, band_index)`` pairs for band indices 0..3,
        where ``band_value`` is bits ``16 * band_index`` through
        ``16 * band_index + 15`` of *x*.
    """
    pairs: List[Tuple[int, int]] = []
    remaining = x
    for index in range(4):
        # Take the low 16 bits, then shift the next band into place.
        pairs.append((remaining & 0xFFFF, index))
        remaining >>= 16
    return pairs
|