Files
tg_resume_db/db.py
2026-03-11 15:27:10 +03:00

297 lines
9.0 KiB
Python

from __future__ import annotations
import sqlite3
from pathlib import Path
# Full DDL script for the resume database; init_db() runs it via executescript().
#
# Contents, in order:
#   * connection pragmas: journal_mode=WAL, synchronous=NORMAL, temp_store=MEMORY;
#   * tables: candidates, candidate_contacts, resumes, sources, files_seen,
#     simhash_buckets, candidate_skills, candidate_roles, candidate_languages,
#     positions, llm_cache (plus their indexes);
#   * the FTS5 virtual table resumes_fts (indexes clean_text; resume_id and
#     candidate_id are stored UNINDEXED);
#   * three triggers on resumes (after insert / delete / update of clean_text,
#     is_active or candidate_id) that remove the matching resumes_fts row and
#     re-insert it only when the resume has is_active = 1.
#
# Every CREATE statement uses IF NOT EXISTS, so the script can be re-run against
# an existing database. The "--" comments inside the string are SQL comments and
# are part of the text sent to SQLite.
#
# NOTE(review): the SQL comment labels resumes_fts "contentless", but the table
# is declared without a content= option — confirm which behaviour is intended.
# NOTE(review): FOREIGN KEY clauses are declared, but neither this script nor
# connect() issues PRAGMA foreign_keys — confirm whether enforcement is expected.
SCHEMA = r"""
PRAGMA journal_mode=WAL;
PRAGMA synchronous=NORMAL;
PRAGMA temp_store=MEMORY;
CREATE TABLE IF NOT EXISTS candidates (
candidate_id TEXT PRIMARY KEY,
name TEXT,
location TEXT,
remote INTEGER,
experience_years REAL,
experience_years_eng REAL, -- инженерный опыт (после фильтра HR)
experience_confidence REAL,
salary_min INTEGER,
salary_max INTEGER,
salary_confidence REAL,
english_level TEXT,
roles_json TEXT,
skills_json TEXT,
primary_languages_json TEXT,
backend_focus INTEGER,
roles_norm TEXT, -- "|backend|devops|"
skills_norm TEXT, -- "|python|k8s|"
created_at TEXT DEFAULT (datetime('now')),
updated_at TEXT DEFAULT (datetime('now'))
);
CREATE TABLE IF NOT EXISTS candidate_contacts (
contact_type TEXT NOT NULL, -- email/phone/tg/github/linkedin/url
contact_value TEXT NOT NULL, -- normalized
candidate_id TEXT NOT NULL,
created_at TEXT DEFAULT (datetime('now')),
PRIMARY KEY(contact_type, contact_value),
FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);
CREATE INDEX IF NOT EXISTS idx_contacts_candidate ON candidate_contacts(candidate_id);
CREATE TABLE IF NOT EXISTS resumes (
resume_id TEXT PRIMARY KEY,
candidate_id TEXT NOT NULL,
sha256 TEXT,
simhash TEXT,
clean_text TEXT NOT NULL,
raw_text TEXT,
extraction_json TEXT,
llm_summary TEXT,
llm_tags_json TEXT,
extract_method TEXT,
extract_quality_score REAL,
extract_quality_flags TEXT,
extract_pages_json TEXT,
doc_type TEXT,
doc_type_confidence REAL,
parse_method TEXT,
parse_version TEXT,
sections_json TEXT,
is_active INTEGER DEFAULT 1,
duplicate_of_resume_id TEXT,
file_path TEXT,
file_mtime INTEGER,
file_size INTEGER,
created_at TEXT DEFAULT (datetime('now')),
FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);
CREATE UNIQUE INDEX IF NOT EXISTS idx_resumes_sha ON resumes(sha256) WHERE sha256 IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_resumes_candidate ON resumes(candidate_id);
CREATE INDEX IF NOT EXISTS idx_resumes_active ON resumes(is_active);
CREATE TABLE IF NOT EXISTS sources (
source_id INTEGER PRIMARY KEY AUTOINCREMENT,
resume_id TEXT NOT NULL,
export_path TEXT,
chat_title TEXT,
message_id TEXT,
message_date TEXT,
origin_type TEXT,
original_file_path TEXT,
original_file_name TEXT,
extra_json TEXT,
created_at TEXT DEFAULT (datetime('now')),
FOREIGN KEY(resume_id) REFERENCES resumes(resume_id)
);
CREATE TABLE IF NOT EXISTS files_seen (
sha256 TEXT PRIMARY KEY,
size INTEGER,
mtime INTEGER,
canonical_resume_id TEXT,
first_seen_at TEXT DEFAULT (datetime('now')),
last_seen_at TEXT DEFAULT (datetime('now'))
);
CREATE TABLE IF NOT EXISTS simhash_buckets (
bucket INTEGER NOT NULL,
band INTEGER NOT NULL,
resume_id TEXT NOT NULL,
PRIMARY KEY(bucket, band, resume_id),
FOREIGN KEY(resume_id) REFERENCES resumes(resume_id)
);
CREATE TABLE IF NOT EXISTS candidate_skills (
candidate_id TEXT NOT NULL,
skill_id TEXT NOT NULL,
skill_label TEXT,
confidence REAL,
source TEXT,
evidence TEXT,
created_at TEXT DEFAULT (datetime('now')),
PRIMARY KEY(candidate_id, skill_id),
FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);
CREATE TABLE IF NOT EXISTS candidate_roles (
candidate_id TEXT NOT NULL,
role TEXT NOT NULL,
confidence REAL,
source TEXT,
evidence TEXT,
created_at TEXT DEFAULT (datetime('now')),
PRIMARY KEY(candidate_id, role),
FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);
CREATE TABLE IF NOT EXISTS candidate_languages (
candidate_id TEXT NOT NULL,
language TEXT NOT NULL,
level TEXT,
confidence REAL,
source TEXT,
evidence TEXT,
created_at TEXT DEFAULT (datetime('now')),
PRIMARY KEY(candidate_id, language),
FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);
CREATE TABLE IF NOT EXISTS positions (
position_id TEXT PRIMARY KEY,
resume_id TEXT NOT NULL,
candidate_id TEXT NOT NULL,
title TEXT,
company TEXT,
date_from TEXT,
date_to TEXT,
is_current INTEGER,
description TEXT,
stack_json TEXT,
created_at TEXT DEFAULT (datetime('now')),
FOREIGN KEY(resume_id) REFERENCES resumes(resume_id),
FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);
CREATE TABLE IF NOT EXISTS llm_cache (
cache_key TEXT PRIMARY KEY,
model TEXT,
result_json TEXT,
created_at TEXT DEFAULT (datetime('now'))
);
-- Full-text index (FTS5): contentless
CREATE VIRTUAL TABLE IF NOT EXISTS resumes_fts USING fts5(
resume_id UNINDEXED,
candidate_id UNINDEXED,
clean_text,
tokenize='unicode61 remove_diacritics 2'
);
-- --- Triggers to keep FTS synced with resumes ---
-- Insert
CREATE TRIGGER IF NOT EXISTS trg_resumes_ai_fts
AFTER INSERT ON resumes
BEGIN
DELETE FROM resumes_fts WHERE resume_id = NEW.resume_id;
INSERT INTO resumes_fts(resume_id, candidate_id, clean_text)
SELECT NEW.resume_id, NEW.candidate_id, NEW.clean_text
WHERE NEW.is_active = 1;
END;
-- Delete
CREATE TRIGGER IF NOT EXISTS trg_resumes_ad_fts
AFTER DELETE ON resumes
BEGIN
DELETE FROM resumes_fts WHERE resume_id = OLD.resume_id;
END;
-- Update (text/active/candidate)
CREATE TRIGGER IF NOT EXISTS trg_resumes_au_fts
AFTER UPDATE OF clean_text, is_active, candidate_id ON resumes
BEGIN
DELETE FROM resumes_fts WHERE resume_id = NEW.resume_id;
INSERT INTO resumes_fts(resume_id, candidate_id, clean_text)
SELECT NEW.resume_id, NEW.candidate_id, NEW.clean_text
WHERE NEW.is_active = 1;
END;
"""
def connect(db_path: str) -> sqlite3.Connection:
    """Open the SQLite database at *db_path*.

    The parent directory of *db_path* is created first (including any missing
    ancestors). Rows fetched through the returned connection are
    ``sqlite3.Row`` objects, so columns can be read by name.
    """
    target = Path(db_path)
    target.parent.mkdir(parents=True, exist_ok=True)

    connection = sqlite3.connect(db_path)
    connection.row_factory = sqlite3.Row
    return connection
def _table_exists(con: sqlite3.Connection, name: str) -> bool:
    """Return True when ``sqlite_master`` lists a table or view called *name*."""
    lookup_sql = "SELECT 1 FROM sqlite_master WHERE type IN ('table','view') AND name=? LIMIT 1"
    match = con.execute(lookup_sql, (name,)).fetchone()
    return match is not None
def _column_exists(con: sqlite3.Connection, table: str, column: str) -> bool:
    """Return True when ``PRAGMA table_info`` for *table* reports *column*.

    *table* is interpolated directly into the PRAGMA statement, so pass only
    trusted identifiers. Rows are read by the ``"name"`` key, so *con* must use
    a row factory that supports name lookup (``connect()`` sets ``sqlite3.Row``).
    """
    table_info = con.execute(f"PRAGMA table_info({table})").fetchall()
    return any(info["name"] == column for info in table_info)
def _add_column_if_missing(con: sqlite3.Connection, table: str, column: str, ddl_type: str) -> None:
    """Add *column* (declared as *ddl_type*) to *table* unless it is already there.

    Does nothing when *table* does not exist or already has *column*. The
    identifiers are interpolated into the ALTER TABLE statement as-is, so pass
    only trusted values. No commit is issued here.
    """
    # Short-circuit keeps the original order: the column lookup only runs
    # once the table is known to exist.
    needs_column = _table_exists(con, table) and not _column_exists(con, table, column)
    if needs_column:
        con.execute(f"ALTER TABLE {table} ADD COLUMN {column} {ddl_type}")
def _ensure_fts_backfilled(con: sqlite3.Connection) -> None:
    """Rebuild ``resumes_fts`` from ``resumes`` when the two are out of sync.

    This repairs databases where the FTS table was created but never
    populated, which makes every search return zero hits. It also clears
    duplicated or stale index rows.

    "Out of sync" is detected by comparing the number of rows in
    ``resumes_fts`` with the number of ``resumes`` rows that have
    ``is_active=1``; any difference triggers a full rebuild. This includes the
    case of zero active resumes with leftover index rows, which are purged.
    Equal counts with differing content are not detected.

    The rebuild runs inside ``with con:``, so it is committed on success and
    rolled back if either statement fails; the index is never left half
    rebuilt. Nothing is written or committed when the counts already match.

    Returns silently when either table is missing or when counting fails with
    an SQLite error (best-effort maintenance step).
    """
    if not _table_exists(con, "resumes") or not _table_exists(con, "resumes_fts"):
        return

    # Count through a tuple-yielding cursor so the check does not depend on
    # whatever row_factory the connection was configured with.
    counter = con.cursor()
    counter.row_factory = None
    try:
        resumes_cnt = int(
            counter.execute("SELECT COUNT(*) AS c FROM resumes WHERE is_active=1").fetchone()[0]
        )
        fts_cnt = int(counter.execute("SELECT COUNT(*) AS c FROM resumes_fts").fetchone()[0])
    except sqlite3.Error:
        # Best effort: an unreadable table must not break database start-up.
        return
    finally:
        counter.close()

    if fts_cnt == resumes_cnt:
        return

    # Any mismatch -> full rebuild (fixes an empty index, duplicates and
    # stale rows alike).
    with con:
        con.execute("DELETE FROM resumes_fts")
        con.execute(
            """
INSERT INTO resumes_fts(resume_id, candidate_id, clean_text)
SELECT resume_id, candidate_id, clean_text
FROM resumes
WHERE is_active=1
"""
        )
def init_db(con: sqlite3.Connection) -> None:
    """Create or upgrade the database schema on *con*.

    Steps, in order:
      1. run the ``SCHEMA`` script;
      2. add columns that databases created by earlier versions may lack;
      3. create ``llm_cache`` if it is still absent;
      4. commit;
      5. backfill the full-text index via ``_ensure_fts_backfilled``.

    Safe to call repeatedly on the same database.
    """
    con.executescript(SCHEMA)

    # Lightweight migrations for existing DBs (safe to re-run).
    # Each entry is (table, column, column type), applied in this order.
    column_migrations = (
        ("candidates", "experience_years_eng", "REAL"),
        ("candidates", "primary_languages_json", "TEXT"),
        ("candidates", "backend_focus", "INTEGER"),
        ("resumes", "llm_summary", "TEXT"),
        ("resumes", "llm_tags_json", "TEXT"),
        ("resumes", "extract_method", "TEXT"),
        ("resumes", "extract_quality_score", "REAL"),
        ("resumes", "extract_quality_flags", "TEXT"),
        ("resumes", "extract_pages_json", "TEXT"),
        ("resumes", "doc_type", "TEXT"),
        ("resumes", "doc_type_confidence", "REAL"),
        ("resumes", "parse_method", "TEXT"),
        ("resumes", "parse_version", "TEXT"),
        ("resumes", "sections_json", "TEXT"),
    )
    for table_name, column_name, column_type in column_migrations:
        _add_column_if_missing(con, table_name, column_name, column_type)

    llm_cache_present = _table_exists(con, "llm_cache")
    if not llm_cache_present:
        con.execute(
            """
CREATE TABLE IF NOT EXISTS llm_cache (
cache_key TEXT PRIMARY KEY,
model TEXT,
result_json TEXT,
created_at TEXT DEFAULT (datetime('now'))
)
"""
        )

    con.commit()
    _ensure_fts_backfilled(con)