# Listing metadata (from the code viewer): 175 lines, 5.1 KiB, Python
from __future__ import annotations
|
|
|
|
import re
|
|
from typing import Dict, List, Optional, Tuple
|
|
|
|
|
|
_SKILL_SYNONYMS: Dict[str, List[str]] = {
|
|
"python": ["py"],
|
|
"javascript": ["js", "node", "nodejs", "java script", "java-script"],
|
|
"typescript": ["ts", "type script", "type-script"],
|
|
"postgresql": ["postgres", "psql"],
|
|
"kubernetes": ["k8s"],
|
|
"docker": [],
|
|
"fastapi": [],
|
|
"django": ["drf", "django rest framework"],
|
|
"flask": [],
|
|
"golang": ["go"],
|
|
"c++": ["cpp"],
|
|
"c#": ["csharp"],
|
|
"redis": [],
|
|
"kafka": [],
|
|
"rabbitmq": [],
|
|
"grpc": [],
|
|
"rest": [],
|
|
}
|
|
|
|
_SKILL_STOP = {"rest", "http", "json", "xml", "oop"}
|
|
|
|
_ROLE_SYNONYMS: Dict[str, List[str]] = {
|
|
"backend": ["backend developer", "backend engineer", "бэкенд", "бекенд", "серверный разработчик"],
|
|
"frontend": ["frontend developer", "frontend engineer", "фронтенд", "фронт"],
|
|
"fullstack": ["full stack", "full-stack", "фулстек", "fullstack developer"],
|
|
"devops": ["sre", "site reliability"],
|
|
"qa": ["tester", "тестировщик"],
|
|
"data": ["data engineer", "data scientist", "ml engineer", "машинное обучение"],
|
|
"mobile": ["android", "ios", "mobile developer", "мобильный разработчик"],
|
|
}
|
|
|
|
|
|
def _build_alias_map(src: Dict[str, List[str]]) -> Dict[str, str]:
|
|
alias = {}
|
|
for canonical, al in src.items():
|
|
alias[canonical] = canonical
|
|
for a in al:
|
|
alias[a] = canonical
|
|
return {k.lower(): v for k, v in alias.items()}
|
|
|
|
|
|
# Lowercased skill spelling (canonical name or alias) -> canonical skill name.
_SKILL_ALIAS = _build_alias_map(_SKILL_SYNONYMS)
|
|
# Lowercased role spelling (canonical name or alias) -> canonical role name.
_ROLE_ALIAS = _build_alias_map(_ROLE_SYNONYMS)
|
|
|
|
|
|
def _normalize_skill_surface(token: str) -> str:
|
|
t = (token or "").strip().lower()
|
|
if not t:
|
|
return ""
|
|
t = t.replace("/", " ")
|
|
t = re.sub(r"[_\-]+", " ", t)
|
|
t = re.sub(r"\s+", " ", t).strip()
|
|
|
|
# "java script", "type script", "postgre sql", "graph ql", "g rpc"
|
|
t = re.sub(r"\bjava\s+script\b", "javascript", t)
|
|
t = re.sub(r"\btype\s+script\b", "typescript", t)
|
|
t = re.sub(r"\bpostgre\s+sql\b", "postgresql", t)
|
|
t = re.sub(r"\bgraph\s+ql\b", "graphql", t)
|
|
t = re.sub(r"\bg\s+rpc\b", "grpc", t)
|
|
t = re.sub(r"\bdocker\s+compose\b", "docker compose", t)
|
|
return t
|
|
|
|
|
|
def normalize_skill(token: str) -> Optional[str]:
    """Map a raw skill token to its canonical name.

    The token is first cleaned with ``_normalize_skill_surface`` and then
    looked up in ``_SKILL_ALIAS``. Unknown skills are returned in their
    cleaned form; falsy or blank input yields ``None``.
    """
    t = _normalize_skill_surface(token)
    if not t:
        return None

    # Split spellings such as "java script" / "java-script" are already fused
    # into "javascript" by the surface normalisation, so no extra
    # java-vs-javascript guard is needed here: a cleaned token equal to
    # "java" can never contain "script".
    return _SKILL_ALIAS.get(t, t)
|
|
|
|
|
|
def normalize_skills(skills: List[str]) -> List[str]:
    """Canonicalise ``skills`` and drop blanks and duplicates.

    The result keeps the order in which each canonical skill first appears.
    A falsy ``skills`` argument yields an empty list.
    """
    canonical = (normalize_skill(raw) for raw in skills or [])
    # dict.fromkeys keeps the first occurrence of each name, in order.
    return list(dict.fromkeys(name for name in canonical if name))
|
|
|
|
|
|
def normalize_role(token: str) -> Optional[str]:
    """Map a raw role token to its canonical name.

    The token is stripped and lowercased, then looked up in ``_ROLE_ALIAS``.
    Unknown roles are returned in their cleaned form; falsy or blank input
    yields ``None``.
    """
    cleaned = (token or "").strip().lower()
    return _ROLE_ALIAS.get(cleaned, cleaned) if cleaned else None
|
|
|
|
|
|
def normalize_roles(roles: List[str]) -> List[str]:
    """Canonicalise ``roles`` and drop blanks and duplicates.

    The result keeps the order in which each canonical role first appears.
    A falsy ``roles`` argument yields an empty list.
    """
    canonical = (normalize_role(raw) for raw in roles or [])
    # dict.fromkeys keeps the first occurrence of each name, in order.
    return list(dict.fromkeys(name for name in canonical if name))
|
|
|
|
|
|
def split_skills_primary_secondary(
    skills: List[str],
    *,
    clean_text: str,
    sections: Optional[Dict[str, str]] = None,
    primary_limit: int = 25,
) -> Tuple[List[str], List[str]]:
    """Rank ``skills`` by prominence and split them into two tiers.

    Each skill starts at 1.0 and is adjusted as follows (all matching is
    case-insensitive and on whole tokens):

    * +2.2 if it occurs in ``sections["skills"]``
    * +1.2 if it occurs in ``sections["experience"]``
    * +0.5 per occurrence in ``clean_text``, capped at +2.5
    * -1.5 if it is listed in ``_SKILL_STOP``

    Args:
        skills: Skill names to rank.
        clean_text: Full text in which occurrences are counted.
        sections: Optional mapping with ``"skills"`` / ``"experience"``
            texts; missing or ``None`` entries are treated as empty.
        primary_limit: Maximum number of primary skills.

    Returns:
        ``(primary, secondary)``: primary holds the best-scoring skills with
        a score of at least 2.0 (at most ``primary_limit``); secondary holds
        every other skill. Both are ordered by descending score, ties
        keeping their input order.
    """
    if not skills:
        return [], []

    section_map = sections or {}
    text = (clean_text or "").lower()
    skills_section = (section_map.get("skills") or "").lower()
    experience_section = (section_map.get("experience") or "").lower()

    scores: Dict[str, float] = {}
    for sk in skills:
        s = sk.lower()
        # Anchor with \b only on edges that are word characters: a trailing
        # \b after "c++" or "c#" would demand a following word character and
        # so never match "c++ developer".
        lead = r"\b" if re.match(r"\w", s) else ""
        trail = r"\b" if re.search(r"\w\Z", s) else ""
        pattern = re.compile(lead + re.escape(s) + trail)

        score = 1.0
        if pattern.search(skills_section):
            score += 2.2
        if pattern.search(experience_section):
            score += 1.2
        score += min(2.5, len(pattern.findall(text)) * 0.5)
        if s in _SKILL_STOP:
            score -= 1.5
        scores[sk] = score

    # sorted() is stable, so equally scored skills keep their input order.
    ranked = sorted(skills, key=lambda x: scores.get(x, 0.0), reverse=True)
    primary = [s for s in ranked if scores.get(s, 0.0) >= 2.0][:primary_limit]
    chosen = set(primary)
    secondary = [s for s in ranked if s not in chosen]
    return primary, secondary
|
|
|
|
|
|
def normalize_location(raw: Optional[str]) -> Optional[str]:
    """Normalise a free-form location string.

    Known spellings of Moscow and Saint Petersburg (matched
    case-insensitively after stripping) are mapped to a canonical
    ``"City, Russia"`` label. Any other value is returned stripped but
    otherwise unchanged.

    Returns:
        The normalised location, or ``None`` when ``raw`` is falsy or
        contains only whitespace.
    """
    if not raw:
        return None
    t = raw.strip()
    if not t:
        # Whitespace-only input carries no location; treat it like "".
        return None
    low = t.lower()
    if low in ("москва", "moscow", "moscow, russia"):
        return "Moscow, Russia"
    if low in (
        "санкт-петербург",
        "спб",
        "питер",
        "saint petersburg",
        # Lowercase of the canonical label itself, mirroring "moscow, russia".
        "saint petersburg, russia",
    ):
        return "Saint Petersburg, Russia"
    return t
|
|
|
|
|
|
def find_skills_in_text(text: str) -> List[str]:
    """Return the canonical skills from ``_SKILL_ALIAS`` mentioned in ``text``.

    Both the text and every alias are passed through
    ``_normalize_skill_surface`` and aliases are matched as whole tokens.
    Each canonical skill is reported once, in the order its first matching
    alias appears in ``_SKILL_ALIAS``. Falsy ``text`` yields an empty list.
    """
    if not text:
        return []

    haystack = _normalize_skill_surface(text)
    found: List[str] = []
    seen = set()  # canonical names already reported
    for alias, canon in _SKILL_ALIAS.items():
        if canon in seen:
            # Already detected through an earlier spelling.
            continue
        key = _normalize_skill_surface(alias)
        if not key:
            # An empty pattern would match everywhere.
            continue
        # Anchor with \b only on edges that are word characters: a trailing
        # \b after "c++" or "c#" would demand a following word character and
        # so never match "c++ developer".
        lead = r"\b" if re.match(r"\w", key) else ""
        trail = r"\b" if re.search(r"\w\Z", key) else ""
        if re.search(lead + re.escape(key) + trail, haystack):
            seen.add(canon)
            found.append(canon)
    return found
|