Initial commit
This commit is contained in:
174
normalize.py
Normal file
174
normalize.py
Normal file
@@ -0,0 +1,174 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
|
||||
# Canonical skill name -> list of alias spellings.
# All lookups are done lower-case (see _build_alias_map).
# NOTE: insertion order matters — find_skills_in_text() reports canonical
# skills in the order their aliases are declared here; do not reorder casually.
_SKILL_SYNONYMS: Dict[str, List[str]] = {
    "python": ["py"],
    "javascript": ["js", "node", "nodejs", "java script", "java-script"],
    "typescript": ["ts", "type script", "type-script"],
    "postgresql": ["postgres", "psql"],
    "kubernetes": ["k8s"],
    "docker": [],
    "fastapi": [],
    "django": ["drf", "django rest framework"],
    "flask": [],
    "golang": ["go"],
    "c++": ["cpp"],
    "c#": ["csharp"],
    "redis": [],
    "kafka": [],
    "rabbitmq": [],
    "grpc": [],
    "rest": [],
}
|
||||
|
||||
# Overly generic tokens that get a scoring penalty when ranking skills
# (used by split_skills_primary_secondary); "rest" is deliberately both a
# known skill and a down-weighted generic term.
_SKILL_STOP = {"rest", "http", "json", "xml", "oop"}
|
||||
|
||||
# Canonical role name -> alias spellings (English and Russian, matched
# lower-case via _ROLE_ALIAS). Alias strings are runtime data — keep exact.
_ROLE_SYNONYMS: Dict[str, List[str]] = {
    "backend": ["backend developer", "backend engineer", "бэкенд", "бекенд", "серверный разработчик"],
    "frontend": ["frontend developer", "frontend engineer", "фронтенд", "фронт"],
    "fullstack": ["full stack", "full-stack", "фулстек", "fullstack developer"],
    "devops": ["sre", "site reliability"],
    "qa": ["tester", "тестировщик"],
    "data": ["data engineer", "data scientist", "ml engineer", "машинное обучение"],
    "mobile": ["android", "ios", "mobile developer", "мобильный разработчик"],
}
|
||||
|
||||
|
||||
def _build_alias_map(src: Dict[str, List[str]]) -> Dict[str, str]:
|
||||
alias = {}
|
||||
for canonical, al in src.items():
|
||||
alias[canonical] = canonical
|
||||
for a in al:
|
||||
alias[a] = canonical
|
||||
return {k.lower(): v for k, v in alias.items()}
|
||||
|
||||
|
||||
# Pre-built lookup tables: lower-cased alias/canonical -> canonical name.
_SKILL_ALIAS = _build_alias_map(_SKILL_SYNONYMS)
_ROLE_ALIAS = _build_alias_map(_ROLE_SYNONYMS)
|
||||
|
||||
|
||||
def _normalize_skill_surface(token: str) -> str:
|
||||
t = (token or "").strip().lower()
|
||||
if not t:
|
||||
return ""
|
||||
t = t.replace("/", " ")
|
||||
t = re.sub(r"[_\-]+", " ", t)
|
||||
t = re.sub(r"\s+", " ", t).strip()
|
||||
|
||||
# "java script", "type script", "postgre sql", "graph ql", "g rpc"
|
||||
t = re.sub(r"\bjava\s+script\b", "javascript", t)
|
||||
t = re.sub(r"\btype\s+script\b", "typescript", t)
|
||||
t = re.sub(r"\bpostgre\s+sql\b", "postgresql", t)
|
||||
t = re.sub(r"\bgraph\s+ql\b", "graphql", t)
|
||||
t = re.sub(r"\bg\s+rpc\b", "grpc", t)
|
||||
t = re.sub(r"\bdocker\s+compose\b", "docker compose", t)
|
||||
return t
|
||||
|
||||
|
||||
def normalize_skill(token: str) -> Optional[str]:
    """Map a raw skill token to its canonical name.

    Returns None for empty input; unknown skills pass through in their
    normalized surface form.

    The original "avoid false-positive java from javascript" guard was
    unreachable and has been removed: _normalize_skill_surface() already
    glues "java script" into "javascript", so the normalized form of such
    input can never equal the bare string "java".
    """
    t = _normalize_skill_surface(token)
    if not t:
        return None
    return _SKILL_ALIAS.get(t, t)
|
||||
|
||||
|
||||
def normalize_skills(skills: List[str]) -> List[str]:
    """Canonicalize a list of skill tokens, dropping blanks and duplicates.

    First-occurrence order of the canonical names is preserved.
    """
    result: List[str] = []
    emitted = set()
    for raw in skills or []:
        canonical = normalize_skill(raw)
        if canonical and canonical not in emitted:
            emitted.add(canonical)
            result.append(canonical)
    return result
|
||||
|
||||
|
||||
def normalize_role(token: str) -> Optional[str]:
    """Map a raw role token to its canonical name (None for empty input)."""
    cleaned = (token or "").strip().lower()
    if not cleaned:
        return None
    # Unknown roles pass through in lower-cased form.
    return _ROLE_ALIAS.get(cleaned, cleaned)
|
||||
|
||||
|
||||
def normalize_roles(roles: List[str]) -> List[str]:
    """Canonicalize a list of role tokens, dropping blanks and duplicates.

    First-occurrence order of the canonical names is preserved.
    """
    result: List[str] = []
    emitted = set()
    for raw in roles or []:
        canonical = normalize_role(raw)
        if canonical and canonical not in emitted:
            emitted.add(canonical)
            result.append(canonical)
    return result
|
||||
|
||||
|
||||
def split_skills_primary_secondary(
    skills: List[str],
    *,
    clean_text: str,
    sections: Dict[str, str] | None = None,
    primary_limit: int = 25,
) -> Tuple[List[str], List[str]]:
    """Rank skills by textual evidence and split them into two tiers.

    Each skill starts at score 1.0, gains a bonus for appearing (substring
    match) in the dedicated "skills" / "experience" sections and for repeated
    whole-word mentions in the full text (capped at +2.5), and loses 1.5 if
    it is a too-generic token (_SKILL_STOP).

    Returns (primary, secondary): primary holds up to ``primary_limit``
    skills scoring >= 2.0, in descending score order; the rest go to
    secondary in the same ranked order.
    """
    if not skills:
        return [], []

    text = (clean_text or "").lower()
    # `or ""` guards against an explicit None section value, which would
    # otherwise crash on .lower().
    skills_section = ((sections or {}).get("skills") or "").lower()
    experience_section = ((sections or {}).get("experience") or "").lower()

    scores: Dict[str, float] = {}
    for sk in skills:
        s = sk.lower()
        score = 1.0
        if s in skills_section:
            score += 2.2
        if s in experience_section:
            score += 1.2
        # Lookarounds instead of \b: \b never matches next to the non-word
        # edges of skills like "c++" or "c#", so those were never counted.
        count = len(re.findall(r"(?<!\w)" + re.escape(s) + r"(?!\w)", text))
        score += min(2.5, count * 0.5)
        if s in _SKILL_STOP:
            score -= 1.5
        scores[sk] = score

    ranked = sorted(skills, key=lambda x: scores.get(x, 0.0), reverse=True)
    primary = [s for s in ranked if scores.get(s, 0.0) >= 2.0][:primary_limit]
    primary_set = set(primary)  # O(1) membership instead of list scans
    secondary = [s for s in ranked if s not in primary_set]
    return primary, secondary
|
||||
|
||||
|
||||
def normalize_location(raw: Optional[str]) -> Optional[str]:
    """Canonicalize well-known city spellings; pass anything else through.

    Falsy input yields None; unknown locations are returned stripped but
    otherwise untouched.
    """
    if not raw:
        return None
    trimmed = raw.strip()
    canonical_by_alias = {
        "москва": "Moscow, Russia",
        "moscow": "Moscow, Russia",
        "moscow, russia": "Moscow, Russia",
        "санкт-петербург": "Saint Petersburg, Russia",
        "спб": "Saint Petersburg, Russia",
        "питер": "Saint Petersburg, Russia",
        "saint petersburg": "Saint Petersburg, Russia",
    }
    return canonical_by_alias.get(trimmed.lower(), trimmed)
|
||||
|
||||
|
||||
def find_skills_in_text(text: str) -> List[str]:
    """Scan free text for any known skill alias; return canonical names.

    Canonical names are reported at most once each, in the order their
    aliases are declared in _SKILL_ALIAS.

    Fixes vs. the original: a single ``seen`` set conflated normalized alias
    keys with found canonical names, so an alias whose surface happened to
    equal an already-found canonical was skipped without ever being searched.
    Also, \\b boundaries never match next to the non-word edges of aliases
    like "c++" or "c#"; lookarounds are used instead.
    """
    if not text:
        return []
    found: List[str] = []
    found_canons = set()
    low = _normalize_skill_surface(text)
    for alias, canon in _SKILL_ALIAS.items():
        # Canonical already reported — no need to test its other aliases.
        if canon in found_canons:
            continue
        key = _normalize_skill_surface(alias)
        if re.search(r"(?<!\w)" + re.escape(key) + r"(?!\w)", low):
            found.append(canon)
            found_canons.add(canon)
    return found
|
||||
Reference in New Issue
Block a user