Initial commit
This commit is contained in:
41
dedup/simhash.py
Normal file
41
dedup/simhash.py
Normal file
@@ -0,0 +1,41 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import re
|
||||
from typing import List, Tuple
|
||||
|
||||
def sha256_file_bytes_iter(f, chunk_size: int = 1024 * 1024) -> str:
    """Return the hex SHA-256 digest of the data read from *f*.

    The object is consumed through repeated ``f.read(chunk_size)`` calls;
    reading stops as soon as a call returns ``b""``.
    """
    digest = hashlib.sha256()
    while True:
        block = f.read(chunk_size)
        # An empty bytes object is the end-of-data sentinel.
        if block == b"":
            break
        digest.update(block)
    return digest.hexdigest()
|
||||
|
||||
def sha256_file(path) -> str:
    """Return the hex SHA-256 digest of the file at *path*.

    The file is opened in binary mode and hashed via
    ``sha256_file_bytes_iter``; it is closed before returning.
    """
    with open(path, "rb") as stream:
        hex_digest = sha256_file_bytes_iter(stream)
    return hex_digest
|
||||
|
||||
def sha1_str(s: str) -> str:
    """Return the hex SHA-1 digest of *s* encoded as UTF-8.

    Characters that cannot be encoded (e.g. lone surrogates) are dropped
    before hashing rather than raising.
    """
    payload = s.encode("utf-8", errors="ignore")
    hasher = hashlib.sha1()
    hasher.update(payload)
    return hasher.hexdigest()
|
||||
|
||||
def simhash64(text: str) -> int:
    """Compute a 64-bit SimHash fingerprint of *text*.

    The text is lower-cased and split into tokens of two or more characters
    drawn from ``[a-zA-Z0-9_#+./-]``.  Every token occurrence is hashed with
    MD5, and the first 8 digest bytes (read big-endian) cast one vote per bit
    position: +1 when the bit is set, -1 when it is clear.  Bit *i* of the
    result is set when the votes for position *i* sum to a strictly positive
    value.

    Returns:
        The fingerprint as a non-negative integer below ``2**64``, or ``0``
        when the text contains no tokens.
    """
    tokens = re.findall(r"[a-zA-Z0-9_#+./-]{2,}", text.lower())
    if not tokens:
        return 0

    votes = [0] * 64
    for tok in tokens:
        # MD5 is only a bit mixer here, not a security primitive; declaring
        # that keeps the call usable on FIPS-restricted builds.
        digest = hashlib.md5(tok.encode("utf-8"), usedforsecurity=False).digest()
        x = int.from_bytes(digest[:8], "big", signed=False)
        for i in range(64):
            votes[i] += 1 if (x >> i) & 1 else -1

    # Ties (a vote sum of exactly 0) leave the bit cleared.
    return sum(1 << i for i, vote in enumerate(votes) if vote > 0)
|
||||
|
||||
def hamming64(a: int, b: int) -> int:
|
||||
return (a ^ b).bit_count()
|
||||
|
||||
def simhash_bands(x: int) -> List[Tuple[int, int]]:
    """Split *x* into four 16-bit bands, least-significant band first.

    Returns a list of ``(band_value, band_index)`` pairs for band indices
    0 through 3, where ``band_value`` is bits ``16*index`` to
    ``16*index + 15`` of *x*.
    """
    # Shifts 0, 16, 32, 48 select the 4 bands x 16 bits.
    return [
        ((x >> shift) & 0xFFFF, band)
        for band, shift in enumerate(range(0, 64, 16))
    ]
|
||||
Reference in New Issue
Block a user