Initial commit

This commit is contained in:
2026-03-11 15:27:10 +03:00
commit 8b4b8d54d1
34 changed files with 7407 additions and 0 deletions

174
normalize.py Normal file
View File

@@ -0,0 +1,174 @@
from __future__ import annotations
import re
from typing import Dict, List, Optional, Tuple
# Canonical skill name -> alternative spellings that resolve to it.
# An empty list means the skill has no aliases; _build_alias_map still
# registers the canonical name itself, so it remains a known skill.
_SKILL_SYNONYMS: Dict[str, List[str]] = {
    "python": ["py"],
    "javascript": ["js", "node", "nodejs", "java script", "java-script"],
    "typescript": ["ts", "type script", "type-script"],
    "postgresql": ["postgres", "psql"],
    "kubernetes": ["k8s"],
    "docker": [],
    "fastapi": [],
    "django": ["drf", "django rest framework"],
    "flask": [],
    "golang": ["go"],
    "c++": ["cpp"],
    "c#": ["csharp"],
    "redis": [],
    "kafka": [],
    "rabbitmq": [],
    "grpc": [],
    "rest": [],
}
# Skills that split_skills_primary_secondary penalises (score - 1.5) when
# ranking. Note that "rest" is also a canonical skill in _SKILL_SYNONYMS.
_SKILL_STOP = {"rest", "http", "json", "xml", "oop"}
# Canonical role name -> alternative titles (English and Russian) that
# resolve to it. Consumed by _build_alias_map to produce _ROLE_ALIAS.
_ROLE_SYNONYMS: Dict[str, List[str]] = {
    "backend": ["backend developer", "backend engineer", "бэкенд", "бекенд", "серверный разработчик"],
    "frontend": ["frontend developer", "frontend engineer", "фронтенд", "фронт"],
    "fullstack": ["full stack", "full-stack", "фулстек", "fullstack developer"],
    "devops": ["sre", "site reliability"],
    "qa": ["tester", "тестировщик"],
    "data": ["data engineer", "data scientist", "ml engineer", "машинное обучение"],
    "mobile": ["android", "ios", "mobile developer", "мобильный разработчик"],
}
def _build_alias_map(src: Dict[str, List[str]]) -> Dict[str, str]:
    """Flatten a ``canonical -> aliases`` table into ``name -> canonical``.

    Every canonical name maps to itself and every alias maps to its
    canonical name. Keys of the result are lower-cased; values keep the
    spelling used in *src*.
    """
    collected: Dict[str, str] = {}
    for target, synonyms in src.items():
        # The canonical name is registered first, then its aliases.
        collected.update(dict.fromkeys((target, *synonyms), target))

    # Lower-casing happens in a second pass, after all names are collected.
    lowered: Dict[str, str] = {}
    for name, target in collected.items():
        lowered[name.lower()] = target
    return lowered
# Lookup tables (lower-cased alias or canonical name -> canonical name),
# built once when the module is imported.
_SKILL_ALIAS = _build_alias_map(_SKILL_SYNONYMS)
_ROLE_ALIAS = _build_alias_map(_ROLE_SYNONYMS)
def _normalize_skill_surface(token: str) -> str:
    """Return *token* in a uniform surface form for skill matching.

    The text is stripped and lower-cased; slashes, underscores and hyphens
    become spaces; whitespace runs collapse to a single space; and a few
    split spellings are glued back together. A falsy or blank token
    yields an empty string.
    """
    text = (token or "").strip().lower()
    if not text:
        return ""

    # Slashes, underscores and hyphens all act as word separators.
    text = re.sub(r"[_\-]+", " ", text.replace("/", " "))
    text = re.sub(r"\s+", " ", text).strip()

    # "java script", "type script", "postgre sql", "graph ql", "g rpc"
    rewrites = (
        (r"\bjava\s+script\b", "javascript"),
        (r"\btype\s+script\b", "typescript"),
        (r"\bpostgre\s+sql\b", "postgresql"),
        (r"\bgraph\s+ql\b", "graphql"),
        (r"\bg\s+rpc\b", "grpc"),
        (r"\bdocker\s+compose\b", "docker compose"),
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)
    return text
def normalize_skill(token: str) -> Optional[str]:
    """Map a raw skill token to its canonical name.

    The token is first brought to surface form by
    ``_normalize_skill_surface`` and then looked up in ``_SKILL_ALIAS``.
    Unknown skills are returned in their surface form.

    Returns:
        The canonical (or surface-normalized) skill name, or ``None`` when
        the token is empty after normalization.
    """
    surface = _normalize_skill_surface(token)
    if not surface:
        return None
    # No separate java/javascript guard is needed here: surface
    # normalization already rewrites "java script" to "javascript", so a
    # token whose surface form is exactly "java" cannot be a split
    # spelling of JavaScript.
    return _SKILL_ALIAS.get(surface, surface)
def normalize_skills(skills: List[str]) -> List[str]:
    """Canonicalize a list of skills, dropping blanks and duplicates.

    Each entry goes through ``normalize_skill``. The result keeps the
    order in which each canonical name first appears. A falsy *skills*
    argument yields an empty list.
    """
    canonical = (normalize_skill(raw) for raw in skills or [])
    # dict.fromkeys keeps first-seen order while discarding repeats.
    return list(dict.fromkeys(name for name in canonical if name))
def normalize_role(token: str) -> Optional[str]:
    """Map a raw role title to its canonical role name.

    The token is stripped and lower-cased, then looked up in
    ``_ROLE_ALIAS``. Unknown roles are returned stripped and lower-cased.
    A falsy or blank token yields ``None``.
    """
    key = (token or "").strip().lower()
    return _ROLE_ALIAS.get(key, key) if key else None
def normalize_roles(roles: List[str]) -> List[str]:
    """Canonicalize a list of roles, dropping blanks and duplicates.

    Each entry goes through ``normalize_role``. The result keeps the
    order in which each canonical name first appears. A falsy *roles*
    argument yields an empty list.
    """
    unique: Dict[str, None] = {}
    for raw in roles or []:
        name = normalize_role(raw)
        if name:
            # setdefault leaves an existing key (and its position) untouched.
            unique.setdefault(name, None)
    return list(unique)
def split_skills_primary_secondary(
    skills: List[str],
    *,
    clean_text: str,
    sections: Dict[str, str] | None = None,
    primary_limit: int = 25,
) -> Tuple[List[str], List[str]]:
    """Rank skills by prominence and split them into two tiers.

    Each skill starts at 1.0 and, compared case-insensitively, gains:

    * 2.2 if it occurs (as a substring) in ``sections["skills"]``;
    * 1.2 if it occurs (as a substring) in ``sections["experience"]``;
    * 0.5 per whole-word mention in *clean_text*, capped at 2.5;

    and loses 1.5 if it is listed in ``_SKILL_STOP``.

    Args:
        skills: Skill names to rank.
        clean_text: Full text in which mentions are counted.
        sections: Optional mapping with "skills" and "experience" text;
            missing or ``None`` values are treated as empty.
        primary_limit: Maximum number of primary skills.

    Returns:
        ``(primary, secondary)``. Primary holds the highest-ranked skills
        scoring at least 2.0, at most *primary_limit* of them. Secondary
        holds every other skill. Both are ordered by descending score,
        and ties keep their input order.
    """
    if not skills:
        return [], []

    text = (clean_text or "").lower()
    section_map = sections or {}
    # `or ""` also covers a section key that is present but set to None.
    skills_section = (section_map.get("skills") or "").lower()
    experience_section = (section_map.get("experience") or "").lower()

    scores: Dict[str, float] = {}
    for skill in skills:
        needle = skill.lower()
        score = 1.0
        if needle in skills_section:
            score += 2.2
        if needle in experience_section:
            score += 1.2
        # Lookarounds instead of \b: a \b after "c++" or "c#" requires a
        # following word character, so such skills were never counted.
        pattern = r"(?<!\w)" + re.escape(needle) + r"(?!\w)"
        mentions = len(re.findall(pattern, text))
        score += min(2.5, mentions * 0.5)
        if needle in _SKILL_STOP:
            score -= 1.5
        scores[skill] = score

    # sorted() is stable, so equally scored skills keep their input order.
    ranked = sorted(skills, key=scores.__getitem__, reverse=True)
    primary = [s for s in ranked if scores[s] >= 2.0][:primary_limit]
    primary_set = set(primary)
    secondary = [s for s in ranked if s not in primary_set]
    return primary, secondary
def normalize_location(raw: Optional[str]) -> Optional[str]:
    """Normalize a free-form location string.

    Known spellings of Moscow and Saint Petersburg (English and Russian,
    case-insensitive) map to "Moscow, Russia" and
    "Saint Petersburg, Russia". Any other value is returned stripped of
    surrounding whitespace, with its original casing.

    Returns:
        The normalized location, or ``None`` when *raw* is ``None``,
        empty, or whitespace only.
    """
    stripped = (raw or "").strip()
    if not stripped:
        # Whitespace-only input carries no location, same as None or "".
        return None

    key = stripped.lower()
    if key in ("москва", "moscow", "moscow, russia"):
        return "Moscow, Russia"
    if key in (
        "санкт-петербург",
        "спб",
        "питер",
        "saint petersburg",
        "saint petersburg, russia",
    ):
        return "Saint Petersburg, Russia"
    return stripped
def find_skills_in_text(text: str) -> List[str]:
    """Return canonical names of known skills mentioned in *text*.

    Both the text and every key of ``_SKILL_ALIAS`` are passed through
    ``_normalize_skill_surface`` and matched as whole words. Each
    canonical skill is reported once, in the iteration order of
    ``_SKILL_ALIAS``. A falsy *text* yields an empty list.
    """
    if not text:
        return []

    haystack = _normalize_skill_surface(text)
    found: List[str] = []
    seen = set()
    for alias, canon in _SKILL_ALIAS.items():
        # Once a skill is reported, its remaining aliases need no search.
        if canon in seen:
            continue
        key = _normalize_skill_surface(alias)
        # Lookarounds instead of \b: a \b after "c++" or "c#" requires a
        # following word character, so those names were never matched.
        pattern = r"(?<!\w)" + re.escape(key) + r"(?!\w)"
        if re.search(pattern, haystack):
            found.append(canon)
            seen.add(canon)
    return found