from __future__ import annotations import sqlite3 from pathlib import Path SCHEMA = r""" PRAGMA journal_mode=WAL; PRAGMA synchronous=NORMAL; PRAGMA temp_store=MEMORY; CREATE TABLE IF NOT EXISTS candidates ( candidate_id TEXT PRIMARY KEY, name TEXT, location TEXT, remote INTEGER, experience_years REAL, experience_years_eng REAL, -- инженерный опыт (после фильтра HR) experience_confidence REAL, salary_min INTEGER, salary_max INTEGER, salary_confidence REAL, english_level TEXT, roles_json TEXT, skills_json TEXT, primary_languages_json TEXT, backend_focus INTEGER, roles_norm TEXT, -- "|backend|devops|" skills_norm TEXT, -- "|python|k8s|" created_at TEXT DEFAULT (datetime('now')), updated_at TEXT DEFAULT (datetime('now')) ); CREATE TABLE IF NOT EXISTS candidate_contacts ( contact_type TEXT NOT NULL, -- email/phone/tg/github/linkedin/url contact_value TEXT NOT NULL, -- normalized candidate_id TEXT NOT NULL, created_at TEXT DEFAULT (datetime('now')), PRIMARY KEY(contact_type, contact_value), FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id) ); CREATE INDEX IF NOT EXISTS idx_contacts_candidate ON candidate_contacts(candidate_id); CREATE TABLE IF NOT EXISTS resumes ( resume_id TEXT PRIMARY KEY, candidate_id TEXT NOT NULL, sha256 TEXT, simhash TEXT, clean_text TEXT NOT NULL, raw_text TEXT, extraction_json TEXT, llm_summary TEXT, llm_tags_json TEXT, extract_method TEXT, extract_quality_score REAL, extract_quality_flags TEXT, extract_pages_json TEXT, doc_type TEXT, doc_type_confidence REAL, parse_method TEXT, parse_version TEXT, sections_json TEXT, is_active INTEGER DEFAULT 1, duplicate_of_resume_id TEXT, file_path TEXT, file_mtime INTEGER, file_size INTEGER, created_at TEXT DEFAULT (datetime('now')), FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id) ); CREATE UNIQUE INDEX IF NOT EXISTS idx_resumes_sha ON resumes(sha256) WHERE sha256 IS NOT NULL; CREATE INDEX IF NOT EXISTS idx_resumes_candidate ON resumes(candidate_id); CREATE INDEX IF NOT EXISTS idx_resumes_active ON resumes(is_active); CREATE TABLE IF NOT EXISTS sources ( source_id INTEGER PRIMARY KEY AUTOINCREMENT, resume_id TEXT NOT NULL, export_path TEXT, chat_title TEXT, message_id TEXT, message_date TEXT, origin_type TEXT, original_file_path TEXT, original_file_name TEXT, extra_json TEXT, created_at TEXT DEFAULT (datetime('now')), FOREIGN KEY(resume_id) REFERENCES resumes(resume_id) ); CREATE TABLE IF NOT EXISTS files_seen ( sha256 TEXT PRIMARY KEY, size INTEGER, mtime INTEGER, canonical_resume_id TEXT, first_seen_at TEXT DEFAULT (datetime('now')), last_seen_at TEXT DEFAULT (datetime('now')) ); CREATE TABLE IF NOT EXISTS simhash_buckets ( bucket INTEGER NOT NULL, band INTEGER NOT NULL, resume_id TEXT NOT NULL, PRIMARY KEY(bucket, band, resume_id), FOREIGN KEY(resume_id) REFERENCES resumes(resume_id) ); CREATE TABLE IF NOT EXISTS candidate_skills ( candidate_id TEXT NOT NULL, skill_id TEXT NOT NULL, skill_label TEXT, confidence REAL, source TEXT, evidence TEXT, created_at TEXT DEFAULT (datetime('now')), PRIMARY KEY(candidate_id, skill_id), FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id) ); CREATE TABLE IF NOT EXISTS candidate_roles ( candidate_id TEXT NOT NULL, role TEXT NOT NULL, confidence REAL, source TEXT, evidence TEXT, created_at TEXT DEFAULT (datetime('now')), PRIMARY KEY(candidate_id, role), FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id) ); CREATE TABLE IF NOT EXISTS candidate_languages ( candidate_id TEXT NOT NULL, language TEXT NOT NULL, level TEXT, confidence REAL, source TEXT, evidence TEXT, created_at TEXT DEFAULT (datetime('now')), PRIMARY KEY(candidate_id, language), FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id) ); CREATE TABLE IF NOT EXISTS positions ( position_id TEXT PRIMARY KEY, resume_id TEXT NOT NULL, candidate_id TEXT NOT NULL, title TEXT, company TEXT, date_from TEXT, date_to TEXT, is_current INTEGER, description TEXT, stack_json TEXT, created_at TEXT DEFAULT (datetime('now')), FOREIGN KEY(resume_id) REFERENCES resumes(resume_id), FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id) ); CREATE TABLE IF NOT EXISTS llm_cache ( cache_key TEXT PRIMARY KEY, model TEXT, result_json TEXT, created_at TEXT DEFAULT (datetime('now')) ); -- Full-text index (FTS5): contentless CREATE VIRTUAL TABLE IF NOT EXISTS resumes_fts USING fts5( resume_id UNINDEXED, candidate_id UNINDEXED, clean_text, tokenize='unicode61 remove_diacritics 2' ); -- --- Triggers to keep FTS synced with resumes --- -- Insert CREATE TRIGGER IF NOT EXISTS trg_resumes_ai_fts AFTER INSERT ON resumes BEGIN DELETE FROM resumes_fts WHERE resume_id = NEW.resume_id; INSERT INTO resumes_fts(resume_id, candidate_id, clean_text) SELECT NEW.resume_id, NEW.candidate_id, NEW.clean_text WHERE NEW.is_active = 1; END; -- Delete CREATE TRIGGER IF NOT EXISTS trg_resumes_ad_fts AFTER DELETE ON resumes BEGIN DELETE FROM resumes_fts WHERE resume_id = OLD.resume_id; END; -- Update (text/active/candidate) CREATE TRIGGER IF NOT EXISTS trg_resumes_au_fts AFTER UPDATE OF clean_text, is_active, candidate_id ON resumes BEGIN DELETE FROM resumes_fts WHERE resume_id = NEW.resume_id; INSERT INTO resumes_fts(resume_id, candidate_id, clean_text) SELECT NEW.resume_id, NEW.candidate_id, NEW.clean_text WHERE NEW.is_active = 1; END; """ def connect(db_path: str) -> sqlite3.Connection: Path(db_path).parent.mkdir(parents=True, exist_ok=True) con = sqlite3.connect(db_path) con.row_factory = sqlite3.Row return con def _table_exists(con: sqlite3.Connection, name: str) -> bool: row = con.execute( "SELECT 1 FROM sqlite_master WHERE type IN ('table','view') AND name=? LIMIT 1", (name,), ).fetchone() return row is not None def _column_exists(con: sqlite3.Connection, table: str, column: str) -> bool: cur = con.execute(f"PRAGMA table_info({table})") for r in cur.fetchall(): if r["name"] == column: return True return False def _add_column_if_missing(con: sqlite3.Connection, table: str, column: str, ddl_type: str) -> None: if not _table_exists(con, table): return if _column_exists(con, table, column): return con.execute(f"ALTER TABLE {table} ADD COLUMN {column} {ddl_type}") def _ensure_fts_backfilled(con: sqlite3.Connection) -> None: """ Если resumes_fts пустая / рассинхронизирована - пересобираем из resumes. Это лечит ситуацию: init_db создал FTS, но данные туда никто не залил => search всегда 0. """ if not _table_exists(con, "resumes") or not _table_exists(con, "resumes_fts"): return try: resumes_cnt = int(con.execute("SELECT COUNT(*) AS c FROM resumes WHERE is_active=1").fetchone()["c"]) fts_cnt = int(con.execute("SELECT COUNT(*) AS c FROM resumes_fts").fetchone()["c"]) except Exception: return if resumes_cnt <= 0: return # Любое несовпадение -> rebuild (убирает и пустоту, и дубли) if fts_cnt != resumes_cnt: con.execute("DELETE FROM resumes_fts") con.execute( """ INSERT INTO resumes_fts(resume_id, candidate_id, clean_text) SELECT resume_id, candidate_id, clean_text FROM resumes WHERE is_active=1 """ ) con.commit() def init_db(con: sqlite3.Connection) -> None: con.executescript(SCHEMA) # Lightweight migrations for existing DBs (safe to re-run) _add_column_if_missing(con, "candidates", "experience_years_eng", "REAL") _add_column_if_missing(con, "candidates", "primary_languages_json", "TEXT") _add_column_if_missing(con, "candidates", "backend_focus", "INTEGER") _add_column_if_missing(con, "resumes", "llm_summary", "TEXT") _add_column_if_missing(con, "resumes", "llm_tags_json", "TEXT") _add_column_if_missing(con, "resumes", "extract_method", "TEXT") _add_column_if_missing(con, "resumes", "extract_quality_score", "REAL") _add_column_if_missing(con, "resumes", "extract_quality_flags", "TEXT") _add_column_if_missing(con, "resumes", "extract_pages_json", "TEXT") _add_column_if_missing(con, "resumes", "doc_type", "TEXT") _add_column_if_missing(con, "resumes", "doc_type_confidence", "REAL") _add_column_if_missing(con, "resumes", "parse_method", "TEXT") _add_column_if_missing(con, "resumes", "parse_version", "TEXT") _add_column_if_missing(con, "resumes", "sections_json", "TEXT") if not _table_exists(con, "llm_cache"): con.execute( """ CREATE TABLE IF NOT EXISTS llm_cache ( cache_key TEXT PRIMARY KEY, model TEXT, result_json TEXT, created_at TEXT DEFAULT (datetime('now')) ) """ ) con.commit() _ensure_fts_backfilled(con)