from __future__ import annotations

import re
from typing import Callable, Dict, Iterable, List, Optional, Tuple

# Canonical skill name -> alternative spellings that should map onto it.
_SKILL_SYNONYMS: Dict[str, List[str]] = {
    "python": ["py"],
    "javascript": ["js", "node", "nodejs", "java script", "java-script"],
    "typescript": ["ts", "type script", "type-script"],
    "postgresql": ["postgres", "psql"],
    "kubernetes": ["k8s"],
    "docker": [],
    "fastapi": [],
    "django": ["drf", "django rest framework"],
    "flask": [],
    "golang": ["go"],
    "c++": ["cpp"],
    "c#": ["csharp"],
    "redis": [],
    "kafka": [],
    "rabbitmq": [],
    "grpc": [],
    "rest": [],
}

# Skills that receive a score penalty in split_skills_primary_secondary().
_SKILL_STOP = {"rest", "http", "json", "xml", "oop"}

# Canonical role name -> alternative spellings (English and Russian).
_ROLE_SYNONYMS: Dict[str, List[str]] = {
    "backend": [
        "backend developer",
        "backend engineer",
        "бэкенд",
        "бекенд",
        "серверный разработчик",
    ],
    "frontend": ["frontend developer", "frontend engineer", "фронтенд", "фронт"],
    "fullstack": ["full stack", "full-stack", "фулстек", "fullstack developer"],
    "devops": ["sre", "site reliability"],
    "qa": ["tester", "тестировщик"],
    "data": ["data engineer", "data scientist", "ml engineer", "машинное обучение"],
    "mobile": ["android", "ios", "mobile developer", "мобильный разработчик"],
}

# Lower-cased raw location -> canonical "City, Country" form.
_LOCATION_ALIASES: Dict[str, str] = {
    "москва": "Moscow, Russia",
    "moscow": "Moscow, Russia",
    "moscow, russia": "Moscow, Russia",
    "санкт-петербург": "Saint Petersburg, Russia",
    "спб": "Saint Petersburg, Russia",
    "питер": "Saint Petersburg, Russia",
    "saint petersburg": "Saint Petersburg, Russia",
    "saint petersburg, russia": "Saint Petersburg, Russia",
}

# Scoring weights used by split_skills_primary_secondary().
_BASE_SCORE = 1.0
_SKILLS_SECTION_BONUS = 2.2
_EXPERIENCE_SECTION_BONUS = 1.2
_PER_MENTION_BONUS = 0.5
_MAX_MENTION_BONUS = 2.5
_STOP_SKILL_PENALTY = 1.5
_PRIMARY_THRESHOLD = 2.0

# Runs of separators ("/", "_", "-", whitespace) collapse into one space.
_SEPARATOR_RUN = re.compile(r"[/_\-\s]+")

# Split spellings that are glued back together after separator collapsing,
# e.g. "java script", "type-script", "postgre sql", "graph_ql", "g rpc".
_SURFACE_REWRITES: Tuple[Tuple["re.Pattern[str]", str], ...] = (
    (re.compile(r"\bjava\s+script\b"), "javascript"),
    (re.compile(r"\btype\s+script\b"), "typescript"),
    (re.compile(r"\bpostgre\s+sql\b"), "postgresql"),
    (re.compile(r"\bgraph\s+ql\b"), "graphql"),
    (re.compile(r"\bg\s+rpc\b"), "grpc"),
)


def _build_alias_map(src: Dict[str, List[str]]) -> Dict[str, str]:
    """Flatten a ``canonical -> aliases`` table into ``lowercase name -> canonical``.

    Every canonical name maps to itself, so a lookup succeeds for both the
    canonical spelling and any of its aliases.
    """
    alias_map: Dict[str, str] = {}
    for canonical, aliases in src.items():
        alias_map[canonical.lower()] = canonical
        for alias in aliases:
            alias_map[alias.lower()] = canonical
    return alias_map


_SKILL_ALIAS = _build_alias_map(_SKILL_SYNONYMS)
_ROLE_ALIAS = _build_alias_map(_ROLE_SYNONYMS)


def _token_pattern(token: str) -> "re.Pattern[str]":
    """Compile a regex matching ``token`` as a whole token.

    Lookarounds are used instead of ``\\b`` because ``\\b`` never matches next
    to a non-word character, which made tokens such as "c++" and "c#"
    impossible to find. For tokens that start and end with word characters
    the two forms are equivalent.
    """
    return re.compile(r"(?<!\w)" + re.escape(token) + r"(?!\w)")


def _normalize_skill_surface(token: str) -> str:
    """Return the lower-cased, separator-normalized surface form of ``token``.

    "/", "_", "-" and whitespace runs become a single space, and known split
    spellings ("java script", "postgre sql", ...) are merged into one word.
    Falsy or blank input yields an empty string.
    """
    surface = (token or "").strip().lower()
    if not surface:
        return ""
    surface = _SEPARATOR_RUN.sub(" ", surface).strip()
    for pattern, replacement in _SURFACE_REWRITES:
        surface = pattern.sub(replacement, surface)
    return surface


def _build_skill_patterns(
    alias_map: Dict[str, str],
) -> List[Tuple["re.Pattern[str]", str]]:
    """Precompile one whole-token pattern per distinct normalized skill alias."""
    patterns: List[Tuple["re.Pattern[str]", str]] = []
    seen_keys = set()
    for alias, canonical in alias_map.items():
        key = _normalize_skill_surface(alias)
        # Aliases like "java script" and "java-script" normalize to the same key.
        if not key or key in seen_keys:
            continue
        seen_keys.add(key)
        patterns.append((_token_pattern(key), canonical))
    return patterns


# (pattern, canonical skill) pairs in alias-table order, built once at import.
_SKILL_PATTERNS = _build_skill_patterns(_SKILL_ALIAS)


def _unique_normalized(
    items: Optional[Iterable[str]],
    normalize: Callable[[str], Optional[str]],
) -> List[str]:
    """Normalize each item, dropping empty results and duplicates, keeping order."""
    result: List[str] = []
    seen = set()
    for item in items or []:
        canonical = normalize(item)
        if canonical and canonical not in seen:
            seen.add(canonical)
            result.append(canonical)
    return result


def normalize_skill(token: str) -> Optional[str]:
    """Map a raw skill string to its canonical name.

    Returns ``None`` for falsy or blank input. Unknown skills are returned in
    their normalized surface form (lower-cased, separators collapsed).
    """
    surface = _normalize_skill_surface(token)
    if not surface:
        return None
    # "java script" is already merged into "javascript" by the surface
    # normalization, so no extra java/javascript disambiguation is needed here.
    return _SKILL_ALIAS.get(surface, surface)


def normalize_skills(skills: List[str]) -> List[str]:
    """Normalize a list of skills, removing blanks and duplicates in order."""
    return _unique_normalized(skills, normalize_skill)


def normalize_role(token: str) -> Optional[str]:
    """Map a raw role string to its canonical name.

    Returns ``None`` for falsy or blank input. Unknown roles are returned
    stripped and lower-cased.
    """
    lowered = (token or "").strip().lower()
    if not lowered:
        return None
    return _ROLE_ALIAS.get(lowered, lowered)


def normalize_roles(roles: List[str]) -> List[str]:
    """Normalize a list of roles, removing blanks and duplicates in order."""
    return _unique_normalized(roles, normalize_role)


def split_skills_primary_secondary(
    skills: List[str],
    *,
    clean_text: str,
    sections: Dict[str, str] | None = None,
    primary_limit: int = 25,
) -> Tuple[List[str], List[str]]:
    """Rank skills by textual evidence and split them into two tiers.

    Each skill starts from a base score and gains a bonus when it appears as
    a whole token in the "skills" section, in the "experience" section, and
    for every mention in ``clean_text`` (capped). Skills listed in
    ``_SKILL_STOP`` are penalised.

    Args:
        skills: Skill names to rank; matching is case-insensitive.
        clean_text: Full document text used to count mentions.
        sections: Optional mapping with "skills" / "experience" texts.
            Missing keys and ``None`` values are treated as empty text.
        primary_limit: Maximum number of skills returned as primary.

    Returns:
        ``(primary, secondary)``, both ordered by descending score. Primary
        holds skills scoring at least the threshold, up to ``primary_limit``;
        secondary holds every other skill.
    """
    if not skills:
        return [], []

    text = (clean_text or "").lower()
    section_map = sections or {}
    # ``or ""`` also covers keys that are present with a None value.
    skills_section = (section_map.get("skills") or "").lower()
    experience_section = (section_map.get("experience") or "").lower()

    scores: Dict[str, float] = {}
    for skill in skills:
        lowered = skill.lower()
        score = _BASE_SCORE
        if lowered:
            pattern = _token_pattern(lowered)
            # Whole-token matching: "go" must not match inside "django".
            if pattern.search(skills_section):
                score += _SKILLS_SECTION_BONUS
            if pattern.search(experience_section):
                score += _EXPERIENCE_SECTION_BONUS
            mentions = len(pattern.findall(text))
            score += min(_MAX_MENTION_BONUS, mentions * _PER_MENTION_BONUS)
        if lowered in _SKILL_STOP:
            score -= _STOP_SKILL_PENALTY
        scores[skill] = score

    # sorted() is stable, so ties keep their original relative order.
    ranked = sorted(skills, key=lambda name: scores.get(name, 0.0), reverse=True)
    primary = [
        name for name in ranked if scores.get(name, 0.0) >= _PRIMARY_THRESHOLD
    ][:primary_limit]
    primary_names = set(primary)
    secondary = [name for name in ranked if name not in primary_names]
    return primary, secondary


def normalize_location(raw: Optional[str]) -> Optional[str]:
    """Canonicalize known city spellings; pass other locations through stripped.

    Returns ``None`` for ``None``, empty, or whitespace-only input.
    """
    if not raw:
        return None
    stripped = raw.strip()
    if not stripped:
        return None
    return _LOCATION_ALIASES.get(stripped.lower(), stripped)


def find_skills_in_text(text: str) -> List[str]:
    """Return canonical names of known skills mentioned in ``text``.

    The text is surface-normalized the same way as the aliases, and each
    alias is matched as a whole token. Results follow alias-table order and
    contain each canonical skill at most once.
    """
    if not text:
        return []
    surface = _normalize_skill_surface(text)
    found: List[str] = []
    seen = set()
    for pattern, canonical in _SKILL_PATTERNS:
        if canonical in seen:
            continue
        if pattern.search(surface):
            seen.add(canonical)
            found.append(canonical)
    return found