Initial commit
This commit is contained in:
296
db.py
Normal file
296
db.py
Normal file
@@ -0,0 +1,296 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Full DDL for the resume database, executed as one script by ``init_db``.
#
# Contents, in order:
#   * connection PRAGMAs (WAL journal, NORMAL sync, in-memory temp store);
#   * ``candidates`` plus per-candidate detail tables (contacts, skills,
#     roles, languages);
#   * ``resumes`` with its provenance (``sources``), work history
#     (``positions``) and de-duplication helpers (``files_seen``,
#     ``simhash_buckets``);
#   * ``llm_cache`` for cached LLM results;
#   * the ``resumes_fts`` FTS5 index and the three triggers that mirror
#     *active* resumes (``is_active = 1``) into it.
#
# Every statement uses ``IF NOT EXISTS`` so the script is safe to re-run.
#
# NOTE(review): FOREIGN KEY clauses are declared, but SQLite only enforces
# them when ``PRAGMA foreign_keys=ON`` is issued on the connection; this
# script does not issue it -- confirm whether enforcement is intended.
#
# The comment on ``experience_years_eng`` reads "engineering experience
# (after the HR filter)". It is left untouched because SQLite stores the
# CREATE TABLE text, comments included, in ``sqlite_master``.
SCHEMA = r"""
PRAGMA journal_mode=WAL;
PRAGMA synchronous=NORMAL;
PRAGMA temp_store=MEMORY;

CREATE TABLE IF NOT EXISTS candidates (
    candidate_id TEXT PRIMARY KEY,
    name TEXT,
    location TEXT,
    remote INTEGER,
    experience_years REAL,
    experience_years_eng REAL, -- инженерный опыт (после фильтра HR)
    experience_confidence REAL,
    salary_min INTEGER,
    salary_max INTEGER,
    salary_confidence REAL,
    english_level TEXT,
    roles_json TEXT,
    skills_json TEXT,
    primary_languages_json TEXT,
    backend_focus INTEGER,
    roles_norm TEXT, -- "|backend|devops|"
    skills_norm TEXT, -- "|python|k8s|"
    created_at TEXT DEFAULT (datetime('now')),
    updated_at TEXT DEFAULT (datetime('now'))
);

CREATE TABLE IF NOT EXISTS candidate_contacts (
    contact_type TEXT NOT NULL, -- email/phone/tg/github/linkedin/url
    contact_value TEXT NOT NULL, -- normalized
    candidate_id TEXT NOT NULL,
    created_at TEXT DEFAULT (datetime('now')),
    PRIMARY KEY(contact_type, contact_value),
    FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);

CREATE INDEX IF NOT EXISTS idx_contacts_candidate ON candidate_contacts(candidate_id);

CREATE TABLE IF NOT EXISTS resumes (
    resume_id TEXT PRIMARY KEY,
    candidate_id TEXT NOT NULL,
    sha256 TEXT,
    simhash TEXT,
    clean_text TEXT NOT NULL,
    raw_text TEXT,
    extraction_json TEXT,
    llm_summary TEXT,
    llm_tags_json TEXT,
    extract_method TEXT,
    extract_quality_score REAL,
    extract_quality_flags TEXT,
    extract_pages_json TEXT,
    doc_type TEXT,
    doc_type_confidence REAL,
    parse_method TEXT,
    parse_version TEXT,
    sections_json TEXT,
    is_active INTEGER DEFAULT 1,
    duplicate_of_resume_id TEXT,
    file_path TEXT,
    file_mtime INTEGER,
    file_size INTEGER,
    created_at TEXT DEFAULT (datetime('now')),
    FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);

CREATE UNIQUE INDEX IF NOT EXISTS idx_resumes_sha ON resumes(sha256) WHERE sha256 IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_resumes_candidate ON resumes(candidate_id);
CREATE INDEX IF NOT EXISTS idx_resumes_active ON resumes(is_active);

CREATE TABLE IF NOT EXISTS sources (
    source_id INTEGER PRIMARY KEY AUTOINCREMENT,
    resume_id TEXT NOT NULL,
    export_path TEXT,
    chat_title TEXT,
    message_id TEXT,
    message_date TEXT,
    origin_type TEXT,
    original_file_path TEXT,
    original_file_name TEXT,
    extra_json TEXT,
    created_at TEXT DEFAULT (datetime('now')),
    FOREIGN KEY(resume_id) REFERENCES resumes(resume_id)
);

CREATE TABLE IF NOT EXISTS files_seen (
    sha256 TEXT PRIMARY KEY,
    size INTEGER,
    mtime INTEGER,
    canonical_resume_id TEXT,
    first_seen_at TEXT DEFAULT (datetime('now')),
    last_seen_at TEXT DEFAULT (datetime('now'))
);

CREATE TABLE IF NOT EXISTS simhash_buckets (
    bucket INTEGER NOT NULL,
    band INTEGER NOT NULL,
    resume_id TEXT NOT NULL,
    PRIMARY KEY(bucket, band, resume_id),
    FOREIGN KEY(resume_id) REFERENCES resumes(resume_id)
);

CREATE TABLE IF NOT EXISTS candidate_skills (
    candidate_id TEXT NOT NULL,
    skill_id TEXT NOT NULL,
    skill_label TEXT,
    confidence REAL,
    source TEXT,
    evidence TEXT,
    created_at TEXT DEFAULT (datetime('now')),
    PRIMARY KEY(candidate_id, skill_id),
    FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);

CREATE TABLE IF NOT EXISTS candidate_roles (
    candidate_id TEXT NOT NULL,
    role TEXT NOT NULL,
    confidence REAL,
    source TEXT,
    evidence TEXT,
    created_at TEXT DEFAULT (datetime('now')),
    PRIMARY KEY(candidate_id, role),
    FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);

CREATE TABLE IF NOT EXISTS candidate_languages (
    candidate_id TEXT NOT NULL,
    language TEXT NOT NULL,
    level TEXT,
    confidence REAL,
    source TEXT,
    evidence TEXT,
    created_at TEXT DEFAULT (datetime('now')),
    PRIMARY KEY(candidate_id, language),
    FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);

CREATE TABLE IF NOT EXISTS positions (
    position_id TEXT PRIMARY KEY,
    resume_id TEXT NOT NULL,
    candidate_id TEXT NOT NULL,
    title TEXT,
    company TEXT,
    date_from TEXT,
    date_to TEXT,
    is_current INTEGER,
    description TEXT,
    stack_json TEXT,
    created_at TEXT DEFAULT (datetime('now')),
    FOREIGN KEY(resume_id) REFERENCES resumes(resume_id),
    FOREIGN KEY(candidate_id) REFERENCES candidates(candidate_id)
);

CREATE TABLE IF NOT EXISTS llm_cache (
    cache_key TEXT PRIMARY KEY,
    model TEXT,
    result_json TEXT,
    created_at TEXT DEFAULT (datetime('now'))
);

-- Full-text index (FTS5). Regular (not contentless) table: it keeps its own
-- copy of clean_text and is maintained by the triggers below.
CREATE VIRTUAL TABLE IF NOT EXISTS resumes_fts USING fts5(
    resume_id UNINDEXED,
    candidate_id UNINDEXED,
    clean_text,
    tokenize='unicode61 remove_diacritics 2'
);

-- --- Triggers to keep FTS synced with resumes ---
-- Insert
CREATE TRIGGER IF NOT EXISTS trg_resumes_ai_fts
AFTER INSERT ON resumes
BEGIN
    DELETE FROM resumes_fts WHERE resume_id = NEW.resume_id;
    INSERT INTO resumes_fts(resume_id, candidate_id, clean_text)
    SELECT NEW.resume_id, NEW.candidate_id, NEW.clean_text
    WHERE NEW.is_active = 1;
END;

-- Delete
CREATE TRIGGER IF NOT EXISTS trg_resumes_ad_fts
AFTER DELETE ON resumes
BEGIN
    DELETE FROM resumes_fts WHERE resume_id = OLD.resume_id;
END;

-- Update (text/active/candidate)
CREATE TRIGGER IF NOT EXISTS trg_resumes_au_fts
AFTER UPDATE OF clean_text, is_active, candidate_id ON resumes
BEGIN
    DELETE FROM resumes_fts WHERE resume_id = NEW.resume_id;
    INSERT INTO resumes_fts(resume_id, candidate_id, clean_text)
    SELECT NEW.resume_id, NEW.candidate_id, NEW.clean_text
    WHERE NEW.is_active = 1;
END;
"""
|
||||
|
||||
|
||||
def connect(db_path: str) -> sqlite3.Connection:
    """Open the SQLite database at *db_path*, creating its directory first.

    Args:
        db_path: Filesystem path of the database file. Missing parent
            directories are created.

    Returns:
        A connection whose ``row_factory`` is ``sqlite3.Row``, so result
        rows can be indexed by column name as well as by position.
    """
    parent_dir = Path(db_path).parent
    parent_dir.mkdir(parents=True, exist_ok=True)

    connection = sqlite3.connect(db_path)
    connection.row_factory = sqlite3.Row
    return connection
|
||||
|
||||
|
||||
def _table_exists(con: sqlite3.Connection, name: str) -> bool:
    """Return True when ``sqlite_master`` lists a table or view called *name*.

    Args:
        con: Open SQLite connection.
        name: Exact object name to look up.
    """
    lookup_sql = "SELECT 1 FROM sqlite_master WHERE type IN ('table','view') AND name=? LIMIT 1"
    found = con.execute(lookup_sql, (name,)).fetchone()
    return found is not None
|
||||
|
||||
|
||||
def _column_exists(con: sqlite3.Connection, table: str, column: str) -> bool:
    """Return True when *table* has a column named exactly *column*.

    Args:
        con: Open SQLite connection. Any ``row_factory`` is accepted,
            because rows are read by position rather than by name.
        table: Table name. It is double-quoted before being placed into
            the PRAGMA, so unusual names cannot break the statement.
        column: Column name to look for (case-sensitive comparison).

    Returns:
        False when the column is absent or the table does not exist
        (``PRAGMA table_info`` yields no rows for an unknown table).
    """
    # PRAGMA arguments cannot be bound as parameters; quote the identifier.
    quoted_table = '"' + table.replace('"', '""') + '"'
    rows = con.execute(f"PRAGMA table_info({quoted_table})").fetchall()
    # table_info columns: cid, name, type, notnull, dflt_value, pk.
    return any(row[1] == column for row in rows)
|
||||
|
||||
|
||||
def _add_column_if_missing(con: sqlite3.Connection, table: str, column: str, ddl_type: str) -> None:
    """Add *column* to *table* unless the table is absent or already has it.

    Args:
        con: Open SQLite connection.
        table: Name of the table to alter.
        column: Name of the column to add.
        ddl_type: Column type text placed after the column name in the
            ``ALTER TABLE`` statement (for example ``"TEXT"``).

    The function does not commit; that is left to the caller.
    """
    # Short-circuit keeps the original order: the column is only inspected
    # once the table is known to exist.
    nothing_to_do = not _table_exists(con, table) or _column_exists(con, table, column)
    if nothing_to_do:
        return

    con.execute(f"ALTER TABLE {table} ADD COLUMN {column} {ddl_type}")
|
||||
|
||||
|
||||
def _ensure_fts_backfilled(con: sqlite3.Connection) -> None:
    """Rebuild ``resumes_fts`` from ``resumes`` when their row counts differ.

    This repairs databases where the FTS table was created but never
    populated (search would always return nothing), as well as tables that
    hold duplicate or stale rows.

    The check compares the number of active resumes (``is_active=1``) with
    the number of FTS rows. On any mismatch the FTS table is emptied and
    refilled from the active resumes. This includes the case of zero active
    resumes with leftover FTS rows, which are then removed.

    Args:
        con: Open SQLite connection. Any ``row_factory`` is accepted.

    The function is best-effort: it returns silently when either table is
    missing or when the counts cannot be read. The rebuild itself runs in a
    single transaction that is committed on success and rolled back if a
    statement fails (the exception is then re-raised).
    """
    if not (_table_exists(con, "resumes") and _table_exists(con, "resumes_fts")):
        return

    try:
        # Positional access so this works with and without sqlite3.Row.
        active_count = int(con.execute("SELECT COUNT(*) FROM resumes WHERE is_active=1").fetchone()[0])
        fts_count = int(con.execute("SELECT COUNT(*) FROM resumes_fts").fetchone()[0])
    except sqlite3.Error:
        # Best-effort: an unreadable table must not break initialization.
        return

    if fts_count == active_count:
        return

    # Any mismatch -> full rebuild (fixes empty, duplicated and stale rows).
    with con:
        con.execute("DELETE FROM resumes_fts")
        con.execute(
            """
            INSERT INTO resumes_fts(resume_id, candidate_id, clean_text)
            SELECT resume_id, candidate_id, clean_text
            FROM resumes
            WHERE is_active=1
            """
        )
|
||||
|
||||
|
||||
def init_db(con: sqlite3.Connection) -> None:
    """Create the schema, apply column migrations and sync the FTS index.

    Steps, in order:
      1. Run ``SCHEMA`` (every statement is ``IF NOT EXISTS``).
      2. Add columns that databases created by older versions may lack.
      3. Commit.
      4. Rebuild ``resumes_fts`` if it is out of step with ``resumes``.

    The function is safe to call repeatedly on the same database.

    Args:
        con: Open SQLite connection to initialize.
    """
    con.executescript(SCHEMA)

    # Lightweight migrations for existing DBs (safe to re-run).
    # ``llm_cache`` needs no separate fallback here: SCHEMA has just
    # created it with CREATE TABLE IF NOT EXISTS.
    column_migrations = (
        ("candidates", "experience_years_eng", "REAL"),
        ("candidates", "primary_languages_json", "TEXT"),
        ("candidates", "backend_focus", "INTEGER"),
        ("resumes", "llm_summary", "TEXT"),
        ("resumes", "llm_tags_json", "TEXT"),
        ("resumes", "extract_method", "TEXT"),
        ("resumes", "extract_quality_score", "REAL"),
        ("resumes", "extract_quality_flags", "TEXT"),
        ("resumes", "extract_pages_json", "TEXT"),
        ("resumes", "doc_type", "TEXT"),
        ("resumes", "doc_type_confidence", "REAL"),
        ("resumes", "parse_method", "TEXT"),
        ("resumes", "parse_version", "TEXT"),
        ("resumes", "sections_json", "TEXT"),
    )
    for table, column, ddl_type in column_migrations:
        _add_column_if_missing(con, table, column, ddl_type)

    con.commit()
    _ensure_fts_backfilled(con)
|
||||
Reference in New Issue
Block a user