Initial commit
This commit is contained in:
174
normalize.py
Normal file
174
normalize.py
Normal file
@@ -0,0 +1,174 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
|
||||
# Canonical skill name -> list of alias spellings.
# All lookups are done lower-case (see _build_alias_map).
# NOTE: insertion order matters — find_skills_in_text() reports canonical
# skills in the order their aliases are declared here; do not reorder casually.
_SKILL_SYNONYMS: Dict[str, List[str]] = {
    "python": ["py"],
    "javascript": ["js", "node", "nodejs", "java script", "java-script"],
    "typescript": ["ts", "type script", "type-script"],
    "postgresql": ["postgres", "psql"],
    "kubernetes": ["k8s"],
    "docker": [],
    "fastapi": [],
    "django": ["drf", "django rest framework"],
    "flask": [],
    "golang": ["go"],
    "c++": ["cpp"],
    "c#": ["csharp"],
    "redis": [],
    "kafka": [],
    "rabbitmq": [],
    "grpc": [],
    "rest": [],
}
|
||||
|
||||
# Overly generic tokens that get a scoring penalty when ranking skills
# (used by split_skills_primary_secondary); "rest" is deliberately both a
# known skill and a down-weighted generic term.
_SKILL_STOP = {"rest", "http", "json", "xml", "oop"}
|
||||
|
||||
# Canonical role name -> alias spellings (English and Russian, matched
# lower-case via _ROLE_ALIAS). Alias strings are runtime data — keep exact.
_ROLE_SYNONYMS: Dict[str, List[str]] = {
    "backend": ["backend developer", "backend engineer", "бэкенд", "бекенд", "серверный разработчик"],
    "frontend": ["frontend developer", "frontend engineer", "фронтенд", "фронт"],
    "fullstack": ["full stack", "full-stack", "фулстек", "fullstack developer"],
    "devops": ["sre", "site reliability"],
    "qa": ["tester", "тестировщик"],
    "data": ["data engineer", "data scientist", "ml engineer", "машинное обучение"],
    "mobile": ["android", "ios", "mobile developer", "мобильный разработчик"],
}
|
||||
|
||||
|
||||
def _build_alias_map(src: Dict[str, List[str]]) -> Dict[str, str]:
|
||||
alias = {}
|
||||
for canonical, al in src.items():
|
||||
alias[canonical] = canonical
|
||||
for a in al:
|
||||
alias[a] = canonical
|
||||
return {k.lower(): v for k, v in alias.items()}
|
||||
|
||||
|
||||
# Pre-built lookup tables: lower-cased alias/canonical -> canonical name.
_SKILL_ALIAS = _build_alias_map(_SKILL_SYNONYMS)
_ROLE_ALIAS = _build_alias_map(_ROLE_SYNONYMS)
|
||||
|
||||
|
||||
def _normalize_skill_surface(token: str) -> str:
|
||||
t = (token or "").strip().lower()
|
||||
if not t:
|
||||
return ""
|
||||
t = t.replace("/", " ")
|
||||
t = re.sub(r"[_\-]+", " ", t)
|
||||
t = re.sub(r"\s+", " ", t).strip()
|
||||
|
||||
# "java script", "type script", "postgre sql", "graph ql", "g rpc"
|
||||
t = re.sub(r"\bjava\s+script\b", "javascript", t)
|
||||
t = re.sub(r"\btype\s+script\b", "typescript", t)
|
||||
t = re.sub(r"\bpostgre\s+sql\b", "postgresql", t)
|
||||
t = re.sub(r"\bgraph\s+ql\b", "graphql", t)
|
||||
t = re.sub(r"\bg\s+rpc\b", "grpc", t)
|
||||
t = re.sub(r"\bdocker\s+compose\b", "docker compose", t)
|
||||
return t
|
||||
|
||||
|
||||
def normalize_skill(token: str) -> Optional[str]:
    """Map a raw skill token to its canonical name.

    Returns None for empty input; unknown skills pass through in their
    normalized surface form.

    The original "avoid false-positive java from javascript" guard was
    unreachable and has been removed: _normalize_skill_surface() already
    glues "java script" into "javascript", so the normalized form of such
    input can never equal the bare string "java".
    """
    t = _normalize_skill_surface(token)
    if not t:
        return None
    return _SKILL_ALIAS.get(t, t)
|
||||
|
||||
|
||||
def normalize_skills(skills: List[str]) -> List[str]:
    """Canonicalize a list of skill tokens, dropping blanks and duplicates.

    First-occurrence order of the canonical names is preserved.
    """
    result: List[str] = []
    emitted = set()
    for raw in skills or []:
        canonical = normalize_skill(raw)
        if canonical and canonical not in emitted:
            emitted.add(canonical)
            result.append(canonical)
    return result
|
||||
|
||||
|
||||
def normalize_role(token: str) -> Optional[str]:
    """Map a raw role token to its canonical name (None for empty input)."""
    cleaned = (token or "").strip().lower()
    if not cleaned:
        return None
    # Unknown roles pass through in lower-cased form.
    return _ROLE_ALIAS.get(cleaned, cleaned)
|
||||
|
||||
|
||||
def normalize_roles(roles: List[str]) -> List[str]:
    """Canonicalize a list of role tokens, dropping blanks and duplicates.

    First-occurrence order of the canonical names is preserved.
    """
    result: List[str] = []
    emitted = set()
    for raw in roles or []:
        canonical = normalize_role(raw)
        if canonical and canonical not in emitted:
            emitted.add(canonical)
            result.append(canonical)
    return result
|
||||
|
||||
|
||||
def split_skills_primary_secondary(
    skills: List[str],
    *,
    clean_text: str,
    sections: Dict[str, str] | None = None,
    primary_limit: int = 25,
) -> Tuple[List[str], List[str]]:
    """Rank skills by textual evidence and split them into two tiers.

    Each skill starts at score 1.0, gains a bonus for appearing (substring
    match) in the dedicated "skills" / "experience" sections and for repeated
    whole-word mentions in the full text (capped at +2.5), and loses 1.5 if
    it is a too-generic token (_SKILL_STOP).

    Returns (primary, secondary): primary holds up to ``primary_limit``
    skills scoring >= 2.0, in descending score order; the rest go to
    secondary in the same ranked order.
    """
    if not skills:
        return [], []

    text = (clean_text or "").lower()
    # `or ""` guards against an explicit None section value, which would
    # otherwise crash on .lower().
    skills_section = ((sections or {}).get("skills") or "").lower()
    experience_section = ((sections or {}).get("experience") or "").lower()

    scores: Dict[str, float] = {}
    for sk in skills:
        s = sk.lower()
        score = 1.0
        if s in skills_section:
            score += 2.2
        if s in experience_section:
            score += 1.2
        # Lookarounds instead of \b: \b never matches next to the non-word
        # edges of skills like "c++" or "c#", so those were never counted.
        count = len(re.findall(r"(?<!\w)" + re.escape(s) + r"(?!\w)", text))
        score += min(2.5, count * 0.5)
        if s in _SKILL_STOP:
            score -= 1.5
        scores[sk] = score

    ranked = sorted(skills, key=lambda x: scores.get(x, 0.0), reverse=True)
    primary = [s for s in ranked if scores.get(s, 0.0) >= 2.0][:primary_limit]
    primary_set = set(primary)  # O(1) membership instead of list scans
    secondary = [s for s in ranked if s not in primary_set]
    return primary, secondary
|
||||
|
||||
|
||||
def normalize_location(raw: Optional[str]) -> Optional[str]:
    """Canonicalize well-known city spellings; pass anything else through.

    Falsy input yields None; unknown locations are returned stripped but
    otherwise untouched.
    """
    if not raw:
        return None
    trimmed = raw.strip()
    canonical_by_alias = {
        "москва": "Moscow, Russia",
        "moscow": "Moscow, Russia",
        "moscow, russia": "Moscow, Russia",
        "санкт-петербург": "Saint Petersburg, Russia",
        "спб": "Saint Petersburg, Russia",
        "питер": "Saint Petersburg, Russia",
        "saint petersburg": "Saint Petersburg, Russia",
    }
    return canonical_by_alias.get(trimmed.lower(), trimmed)
|
||||
|
||||
|
||||
def find_skills_in_text(text: str) -> List[str]:
    """Scan free text for any known skill alias; return canonical names.

    Canonical names are reported at most once each, in the order their
    aliases are declared in _SKILL_ALIAS.

    Fixes vs. the original: a single ``seen`` set conflated normalized alias
    keys with found canonical names, so an alias whose surface happened to
    equal an already-found canonical was skipped without ever being searched.
    Also, \\b boundaries never match next to the non-word edges of aliases
    like "c++" or "c#"; lookarounds are used instead.
    """
    if not text:
        return []
    found: List[str] = []
    found_canons = set()
    low = _normalize_skill_surface(text)
    for alias, canon in _SKILL_ALIAS.items():
        # Canonical already reported — no need to test its other aliases.
        if canon in found_canons:
            continue
        key = _normalize_skill_surface(alias)
        if re.search(r"(?<!\w)" + re.escape(key) + r"(?!\w)", low):
            found.append(canon)
            found_canons.add(canon)
    return found
|
||||
Reference in New Issue
Block a user