# File listing metadata (extraction residue): 297 lines, 9.0 KiB, Python.
from __future__ import annotations
|
|
|
|
import sqlite3
|
|
from pathlib import Path
|
|
|
|
|
|
# Full DDL script for the resume database; intended to be run with
# ``Connection.executescript``.  Every CREATE uses IF NOT EXISTS, so the
# script can be re-applied to an existing database.
#
# Contents, in order:
#   * connection pragmas (WAL journal, synchronous=NORMAL, in-memory temp store);
#   * core tables: candidates, candidate_contacts, resumes, sources, files_seen,
#     simhash_buckets, candidate_skills, candidate_roles, candidate_languages,
#     positions, llm_cache;
#   * the FTS5 virtual table ``resumes_fts`` plus three triggers on ``resumes``
#     that mirror only rows with is_active = 1 into it.
#
# NOTE(review): the inline SQL comment labels ``resumes_fts`` "contentless",
# but the table is declared without a ``content=`` option, so FTS5 keeps its
# own copy of the inserted rows.  The SQL text is left untouched here.
SCHEMA = r"""
PRAGMA journal_mode=WAL;
PRAGMA synchronous=NORMAL;
PRAGMA temp_store=MEMORY;

CREATE TABLE IF NOT EXISTS candidates (
    candidate_id TEXT PRIMARY KEY,
    name TEXT,
    location TEXT,
    remote INTEGER,
    experience_years REAL,
    experience_years_eng REAL, -- инженерный опыт (после фильтра HR)
    experience_confidence REAL,
    salary_min INTEGER,
    salary_max INTEGER,
    salary_confidence REAL,
    english_level TEXT,
    roles_json TEXT,
    skills_json TEXT,
    primary_languages_json TEXT,
    backend_focus INTEGER,
    roles_norm TEXT, -- "|backend|devops|"
    skills_norm TEXT, -- "|python|k8s|"
    created_at TEXT DEFAULT (datetime('now')),
    updated_at TEXT DEFAULT (datetime('now'))
);

CREATE TABLE IF NOT EXISTS candidate_contacts (
    contact_type TEXT NOT NULL, -- email/phone/tg/github/linkedin/url
    contact_value TEXT NOT NULL, -- normalized
    candidate_id TEXT NOT NULL,
    created_at TEXT DEFAULT (datetime('now')),
    PRIMARY KEY(contact_type, contact_value),
    FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);

CREATE INDEX IF NOT EXISTS idx_contacts_candidate ON candidate_contacts(candidate_id);

CREATE TABLE IF NOT EXISTS resumes (
    resume_id TEXT PRIMARY KEY,
    candidate_id TEXT NOT NULL,
    sha256 TEXT,
    simhash TEXT,
    clean_text TEXT NOT NULL,
    raw_text TEXT,
    extraction_json TEXT,
    llm_summary TEXT,
    llm_tags_json TEXT,
    extract_method TEXT,
    extract_quality_score REAL,
    extract_quality_flags TEXT,
    extract_pages_json TEXT,
    doc_type TEXT,
    doc_type_confidence REAL,
    parse_method TEXT,
    parse_version TEXT,
    sections_json TEXT,
    is_active INTEGER DEFAULT 1,
    duplicate_of_resume_id TEXT,
    file_path TEXT,
    file_mtime INTEGER,
    file_size INTEGER,
    created_at TEXT DEFAULT (datetime('now')),
    FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);

CREATE UNIQUE INDEX IF NOT EXISTS idx_resumes_sha ON resumes(sha256) WHERE sha256 IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_resumes_candidate ON resumes(candidate_id);
CREATE INDEX IF NOT EXISTS idx_resumes_active ON resumes(is_active);

CREATE TABLE IF NOT EXISTS sources (
    source_id INTEGER PRIMARY KEY AUTOINCREMENT,
    resume_id TEXT NOT NULL,
    export_path TEXT,
    chat_title TEXT,
    message_id TEXT,
    message_date TEXT,
    origin_type TEXT,
    original_file_path TEXT,
    original_file_name TEXT,
    extra_json TEXT,
    created_at TEXT DEFAULT (datetime('now')),
    FOREIGN KEY(resume_id) REFERENCES resumes(resume_id)
);

CREATE TABLE IF NOT EXISTS files_seen (
    sha256 TEXT PRIMARY KEY,
    size INTEGER,
    mtime INTEGER,
    canonical_resume_id TEXT,
    first_seen_at TEXT DEFAULT (datetime('now')),
    last_seen_at TEXT DEFAULT (datetime('now'))
);

CREATE TABLE IF NOT EXISTS simhash_buckets (
    bucket INTEGER NOT NULL,
    band INTEGER NOT NULL,
    resume_id TEXT NOT NULL,
    PRIMARY KEY(bucket, band, resume_id),
    FOREIGN KEY(resume_id) REFERENCES resumes(resume_id)
);

CREATE TABLE IF NOT EXISTS candidate_skills (
    candidate_id TEXT NOT NULL,
    skill_id TEXT NOT NULL,
    skill_label TEXT,
    confidence REAL,
    source TEXT,
    evidence TEXT,
    created_at TEXT DEFAULT (datetime('now')),
    PRIMARY KEY(candidate_id, skill_id),
    FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);

CREATE TABLE IF NOT EXISTS candidate_roles (
    candidate_id TEXT NOT NULL,
    role TEXT NOT NULL,
    confidence REAL,
    source TEXT,
    evidence TEXT,
    created_at TEXT DEFAULT (datetime('now')),
    PRIMARY KEY(candidate_id, role),
    FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);

CREATE TABLE IF NOT EXISTS candidate_languages (
    candidate_id TEXT NOT NULL,
    language TEXT NOT NULL,
    level TEXT,
    confidence REAL,
    source TEXT,
    evidence TEXT,
    created_at TEXT DEFAULT (datetime('now')),
    PRIMARY KEY(candidate_id, language),
    FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);

CREATE TABLE IF NOT EXISTS positions (
    position_id TEXT PRIMARY KEY,
    resume_id TEXT NOT NULL,
    candidate_id TEXT NOT NULL,
    title TEXT,
    company TEXT,
    date_from TEXT,
    date_to TEXT,
    is_current INTEGER,
    description TEXT,
    stack_json TEXT,
    created_at TEXT DEFAULT (datetime('now')),
    FOREIGN KEY(resume_id) REFERENCES resumes(resume_id),
    FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);

CREATE TABLE IF NOT EXISTS llm_cache (
    cache_key TEXT PRIMARY KEY,
    model TEXT,
    result_json TEXT,
    created_at TEXT DEFAULT (datetime('now'))
);

-- Full-text index (FTS5): contentless
CREATE VIRTUAL TABLE IF NOT EXISTS resumes_fts USING fts5(
    resume_id UNINDEXED,
    candidate_id UNINDEXED,
    clean_text,
    tokenize='unicode61 remove_diacritics 2'
);

-- --- Triggers to keep FTS synced with resumes ---
-- Insert
CREATE TRIGGER IF NOT EXISTS trg_resumes_ai_fts
AFTER INSERT ON resumes
BEGIN
    DELETE FROM resumes_fts WHERE resume_id = NEW.resume_id;
    INSERT INTO resumes_fts(resume_id, candidate_id, clean_text)
    SELECT NEW.resume_id, NEW.candidate_id, NEW.clean_text
    WHERE NEW.is_active = 1;
END;

-- Delete
CREATE TRIGGER IF NOT EXISTS trg_resumes_ad_fts
AFTER DELETE ON resumes
BEGIN
    DELETE FROM resumes_fts WHERE resume_id = OLD.resume_id;
END;

-- Update (text/active/candidate)
CREATE TRIGGER IF NOT EXISTS trg_resumes_au_fts
AFTER UPDATE OF clean_text, is_active, candidate_id ON resumes
BEGIN
    DELETE FROM resumes_fts WHERE resume_id = NEW.resume_id;
    INSERT INTO resumes_fts(resume_id, candidate_id, clean_text)
    SELECT NEW.resume_id, NEW.candidate_id, NEW.clean_text
    WHERE NEW.is_active = 1;
END;
"""
|
|
|
|
|
|
def connect(db_path: str) -> sqlite3.Connection:
    """Open the SQLite database at *db_path* and return the connection.

    The parent directory of *db_path* is created first (including any
    missing ancestors), and the connection is configured to return
    ``sqlite3.Row`` objects so columns can be read by name.
    """
    db_file = Path(db_path)
    db_file.parent.mkdir(parents=True, exist_ok=True)

    connection = sqlite3.connect(db_path)
    connection.row_factory = sqlite3.Row
    return connection
|
|
|
|
|
|
def _table_exists(con: sqlite3.Connection, name: str) -> bool:
|
|
row = con.execute(
|
|
"SELECT 1 FROM sqlite_master WHERE type IN ('table','view') AND name=? LIMIT 1",
|
|
(name,),
|
|
).fetchone()
|
|
return row is not None
|
|
|
|
|
|
def _column_exists(con: sqlite3.Connection, table: str, column: str) -> bool:
|
|
cur = con.execute(f"PRAGMA table_info({table})")
|
|
for r in cur.fetchall():
|
|
if r["name"] == column:
|
|
return True
|
|
return False
|
|
|
|
|
|
def _add_column_if_missing(con: sqlite3.Connection, table: str, column: str, ddl_type: str) -> None:
    """Add *column* of type *ddl_type* to *table* unless it is already there.

    Does nothing when *table* itself does not exist, which makes the call
    safe to repeat on every start-up.
    """
    # Short-circuit keeps the column probe from running on a missing table.
    nothing_to_do = not _table_exists(con, table) or _column_exists(con, table, column)
    if nothing_to_do:
        return

    alter_sql = f"ALTER TABLE {table} ADD COLUMN {column} {ddl_type}"
    con.execute(alter_sql)
|
|
|
|
|
|
def _ensure_fts_backfilled(con: sqlite3.Connection) -> None:
|
|
"""
|
|
Если resumes_fts пустая / рассинхронизирована - пересобираем из resumes.
|
|
Это лечит ситуацию: init_db создал FTS, но данные туда никто не залил => search всегда 0.
|
|
"""
|
|
if not _table_exists(con, "resumes") or not _table_exists(con, "resumes_fts"):
|
|
return
|
|
|
|
try:
|
|
resumes_cnt = int(con.execute("SELECT COUNT(*) AS c FROM resumes WHERE is_active=1").fetchone()["c"])
|
|
fts_cnt = int(con.execute("SELECT COUNT(*) AS c FROM resumes_fts").fetchone()["c"])
|
|
except Exception:
|
|
return
|
|
|
|
if resumes_cnt <= 0:
|
|
return
|
|
|
|
# Любое несовпадение -> rebuild (убирает и пустоту, и дубли)
|
|
if fts_cnt != resumes_cnt:
|
|
con.execute("DELETE FROM resumes_fts")
|
|
con.execute(
|
|
"""
|
|
INSERT INTO resumes_fts(resume_id, candidate_id, clean_text)
|
|
SELECT resume_id, candidate_id, clean_text
|
|
FROM resumes
|
|
WHERE is_active=1
|
|
"""
|
|
)
|
|
con.commit()
|
|
|
|
|
|
def init_db(con: sqlite3.Connection) -> None:
    """Create or upgrade the database schema on *con*.

    Steps, in order:
      1. Run the ``SCHEMA`` script (all statements are IF NOT EXISTS).
      2. Add columns that databases created by earlier versions may lack.
      3. Commit, then rebuild the full-text index if it is out of sync.

    Safe to call on every start-up.
    """
    con.executescript(SCHEMA)

    # Lightweight migrations for existing DBs (safe to re-run): each entry is
    # (table, column, column type) and is skipped when already present.
    # ``llm_cache`` needs no entry here because SCHEMA creates it.
    column_migrations = (
        ("candidates", "experience_years_eng", "REAL"),
        ("candidates", "primary_languages_json", "TEXT"),
        ("candidates", "backend_focus", "INTEGER"),
        ("resumes", "llm_summary", "TEXT"),
        ("resumes", "llm_tags_json", "TEXT"),
        ("resumes", "extract_method", "TEXT"),
        ("resumes", "extract_quality_score", "REAL"),
        ("resumes", "extract_quality_flags", "TEXT"),
        ("resumes", "extract_pages_json", "TEXT"),
        ("resumes", "doc_type", "TEXT"),
        ("resumes", "doc_type_confidence", "REAL"),
        ("resumes", "parse_method", "TEXT"),
        ("resumes", "parse_version", "TEXT"),
        ("resumes", "sections_json", "TEXT"),
    )
    for table, column, ddl_type in column_migrations:
        _add_column_if_missing(con, table, column, ddl_type)

    con.commit()
    _ensure_fts_backfilled(con)
|