Spaces:
Sleeping
Sleeping
CRITICAL FIX: disable ML models for keyword extraction on free CPU - use regex+TF-IDF only (instant, no hanging)
Browse files- keyword_extractor.py +134 -202
keyword_extractor.py
CHANGED
|
@@ -1,130 +1,23 @@
|
|
| 1 |
"""
|
| 2 |
Component 2: JD Keyword Extractor
|
| 3 |
-
|
|
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
import re
|
| 7 |
-
import os
|
| 8 |
import logging
|
| 9 |
-
import signal
|
| 10 |
-
import traceback
|
| 11 |
from typing import List, Dict, Tuple, Optional
|
| 12 |
-
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError
|
| 13 |
|
| 14 |
logging.basicConfig(level=logging.INFO)
|
| 15 |
logger = logging.getLogger(__name__)
|
| 16 |
|
| 17 |
-
# Timeout for individual model calls (seconds)
|
| 18 |
-
MODEL_CALL_TIMEOUT = 30
|
| 19 |
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
global _ner_pipeline, _keybert_model, _models_loaded
|
| 27 |
-
if _models_loaded:
|
| 28 |
-
return
|
| 29 |
-
|
| 30 |
-
try:
|
| 31 |
-
logger.info("Loading NER skill extraction model...")
|
| 32 |
-
from transformers import pipeline
|
| 33 |
-
_ner_pipeline = pipeline(
|
| 34 |
-
task="token-classification",
|
| 35 |
-
model="algiraldohe/lm-ner-linkedin-skills-recognition",
|
| 36 |
-
aggregation_strategy="simple",
|
| 37 |
-
device=-1,
|
| 38 |
-
)
|
| 39 |
-
logger.info("β
NER model loaded")
|
| 40 |
-
except Exception as e:
|
| 41 |
-
logger.warning(f"β οΈ NER model failed: {e}")
|
| 42 |
-
_ner_pipeline = None
|
| 43 |
-
|
| 44 |
-
try:
|
| 45 |
-
logger.info("Loading KeyBERT model...")
|
| 46 |
-
from keybert import KeyBERT
|
| 47 |
-
_keybert_model = KeyBERT(model="all-MiniLM-L6-v2")
|
| 48 |
-
logger.info("β
KeyBERT model loaded")
|
| 49 |
-
except Exception as e:
|
| 50 |
-
logger.warning(f"β οΈ KeyBERT model failed: {e}")
|
| 51 |
-
_keybert_model = None
|
| 52 |
-
|
| 53 |
-
_models_loaded = True
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
_load_models()
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
def _run_with_timeout(fn, timeout=MODEL_CALL_TIMEOUT, default=None):
    """Run ``fn()`` in a worker thread and give up waiting after ``timeout`` seconds.

    Args:
        fn: Zero-argument callable to execute.
        timeout: Maximum number of seconds to wait for ``fn`` to return.
        default: Value returned when ``fn`` times out or raises.

    Returns:
        The return value of ``fn``, or ``default`` on timeout or error.

    Note:
        A Python thread cannot be killed, so after a timeout ``fn`` keeps
        running in the background until it finishes on its own; only the
        caller is released.
    """
    fn_name = getattr(fn, "__name__", "function")
    # Deliberately NOT a ``with`` block: the executor's __exit__ calls
    # shutdown(wait=True), which blocks until ``fn`` completes and therefore
    # makes the timeout useless (the caller would hang exactly as long as fn).
    executor = ThreadPoolExecutor(max_workers=1)
    try:
        future = executor.submit(fn)
        return future.result(timeout=timeout)
    except FuturesTimeoutError:
        logger.warning(f"⚠️ {fn_name} timed out after {timeout}s")
        return default
    except Exception as e:
        logger.warning(f"⚠️ {fn_name} error: {e}")
        return default
    finally:
        # Release the executor without waiting for a still-running worker.
        executor.shutdown(wait=False)
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
def extract_skills_ner(text: str, min_score: float = 0.65) -> Dict[str, List[str]]:
    """Extract skills from ``text`` with the module-level NER pipeline.

    Args:
        text: Text to analyse (e.g. a job description).
        min_score: Entities scoring below this value are discarded.

    Returns:
        A dict with the keys ``"BUS"``, ``"TECHNOLOGY"``, ``"TECHNICAL"`` and
        ``"SOFT"``, each mapping to a list of unique skill strings in the
        order they were found. All lists are empty when the pipeline is not
        loaded, the text is empty, or the model call times out or fails.
    """
    empty = {"BUS": [], "TECHNOLOGY": [], "TECHNICAL": [], "SOFT": []}
    if _ner_pipeline is None or not text:
        return empty

    def _do_ner():
        # Fixed-size character chunks; presumably sized to stay under the
        # model's input limit -- TODO confirm against the model card.
        max_chars = 1800
        chunks = [text[i:i + max_chars] for i in range(0, len(text), max_chars)]
        all_entities = []
        for index, chunk in enumerate(chunks):
            try:
                all_entities.extend(_ner_pipeline(chunk))
            except Exception as exc:
                # Best effort: one bad chunk must not discard the others,
                # but the failure should be visible in the logs.
                logger.warning(f"NER failed on chunk {index + 1}/{len(chunks)}: {exc}")
                continue

        skills = {"BUS": [], "TECHNOLOGY": [], "TECHNICAL": [], "SOFT": []}
        seen = set()
        for ent in all_entities:
            label = ent.get("entity_group", "")
            word = ent.get("word", "").strip()
            score = ent.get("score", 0)
            # Trim leading/trailing '#' and whitespace, then collapse runs of
            # whitespace inside the phrase.
            word = re.sub(r'^[#\s]+|[#\s]+$', '', word)
            word = re.sub(r'\s+', ' ', word)
            if not word or len(word) < 2 or score < min_score:
                continue
            word_lower = word.lower()
            # De-duplicate case-insensitively across all categories.
            if label in skills and word_lower not in seen:
                seen.add(word_lower)
                skills[label].append(word)
        return skills

    result = _run_with_timeout(_do_ner, timeout=MODEL_CALL_TIMEOUT, default=empty)
    return result if result is not None else empty
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
def extract_keywords_keybert(text: str, top_n: int = 20) -> List[Tuple[str, float]]:
    """Return up to ``top_n`` (keyphrase, score) pairs from the KeyBERT model.

    Yields an empty list when the module-level KeyBERT model is not loaded,
    or when the model call times out or fails inside ``_run_with_timeout``.
    """
    if _keybert_model is None:
        return []

    def _do_keybert():
        # Options forwarded unchanged to KeyBERT.extract_keywords.
        options = dict(
            keyphrase_ngram_range=(1, 3),
            stop_words="english",
            use_maxsum=True,
            nr_candidates=40,
            top_n=top_n,
        )
        return _keybert_model.extract_keywords(text, **options)

    outcome = _run_with_timeout(_do_keybert, timeout=MODEL_CALL_TIMEOUT, default=[])
    if outcome is None:
        return []
    return outcome
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
def extract_keywords_tfidf(text: str, top_n: int = 15) -> List[str]:
|
| 128 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 129 |
import numpy as np
|
| 130 |
|
|
@@ -135,125 +28,184 @@ def extract_keywords_tfidf(text: str, top_n: int = 15) -> List[str]:
|
|
| 135 |
scores = tfidf_matrix.toarray()[0]
|
| 136 |
top_indices = np.argsort(scores)[::-1][:top_n]
|
| 137 |
return [feature_names[i] for i in top_indices if scores[i] > 0]
|
| 138 |
-
except Exception:
|
|
|
|
| 139 |
return []
|
| 140 |
|
| 141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
TECH_SKILLS_VOCAB = {
|
| 143 |
"TECHNOLOGY": [
|
| 144 |
"python", "java", "javascript", "typescript", "c\\+\\+", "c#", "ruby", "go",
|
| 145 |
"rust", "scala", "kotlin", "swift", "php", "matlab", "julia",
|
| 146 |
-
"sql", "nosql", "html", "css", "bash", "shell",
|
| 147 |
-
"tensorflow", "pytorch", "keras", "scikit-learn",
|
| 148 |
-
"pandas", "numpy", "scipy", "matplotlib",
|
| 149 |
-
"spark", "hadoop", "kafka", "airflow", "dbt",
|
| 150 |
-
"docker", "kubernetes", "terraform", "ansible",
|
| 151 |
-
"aws", "azure", "gcp", "google cloud",
|
| 152 |
-
"react", "angular", "vue", "
|
| 153 |
-
"
|
| 154 |
-
"
|
| 155 |
-
"
|
| 156 |
-
"
|
| 157 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
],
|
| 159 |
"TECHNICAL": [
|
| 160 |
-
"machine learning", "deep learning", "neural network", "
|
| 161 |
-
"natural language processing", "computer vision",
|
|
|
|
| 162 |
"data science", "data engineering", "data analytics", "data analysis",
|
| 163 |
-
"
|
| 164 |
-
"
|
| 165 |
-
"
|
| 166 |
-
"
|
| 167 |
-
"
|
| 168 |
-
"
|
| 169 |
-
"
|
| 170 |
-
"
|
| 171 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
],
|
| 173 |
"SOFT": [
|
| 174 |
"leadership", "communication", "teamwork", "collaboration",
|
| 175 |
-
"problem solving", "
|
| 176 |
-
"
|
| 177 |
-
"
|
| 178 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
],
|
| 180 |
}
|
| 181 |
|
| 182 |
|
| 183 |
def extract_skills_regex(text: str) -> Dict[str, List[str]]:
|
|
|
|
| 184 |
text_lower = text.lower()
|
| 185 |
skills = {"BUS": [], "TECHNOLOGY": [], "TECHNICAL": [], "SOFT": []}
|
| 186 |
seen = set()
|
|
|
|
| 187 |
for category, patterns in TECH_SKILLS_VOCAB.items():
|
| 188 |
for pattern in patterns:
|
| 189 |
try:
|
| 190 |
-
if re.search(r'\b' + pattern + r'\b', text_lower
|
| 191 |
-
clean = pattern.replace("\\+", "+").replace("\\.", ".")
|
| 192 |
if clean not in seen:
|
| 193 |
seen.add(clean)
|
| 194 |
display = clean.title() if len(clean) > 3 else clean.upper()
|
| 195 |
skills[category].append(display)
|
| 196 |
except Exception:
|
| 197 |
continue
|
|
|
|
| 198 |
return skills
|
| 199 |
|
| 200 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
def extract_all_keywords(
|
| 202 |
job_description: str,
|
| 203 |
resume_text: Optional[str] = None,
|
| 204 |
) -> Dict:
|
| 205 |
-
"""
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
logger.info("
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
ner_skills = extract_skills_regex(job_description)
|
| 218 |
-
logger.info(f" Layer 1 done: {sum(len(v) for v in ner_skills.values())} skills found")
|
| 219 |
-
|
| 220 |
-
# Layer 2: KeyBERT (with timeout)
|
| 221 |
-
logger.info(" Layer 2: KeyBERT extraction...")
|
| 222 |
-
keybert_kws = extract_keywords_keybert(job_description, top_n=20)
|
| 223 |
-
logger.info(f" Layer 2 done: {len(keybert_kws)} keywords found")
|
| 224 |
-
|
| 225 |
-
# Layer 3: TF-IDF (always fast)
|
| 226 |
-
logger.info(" Layer 3: TF-IDF extraction...")
|
| 227 |
tfidf_terms = extract_keywords_tfidf(job_description, top_n=20)
|
| 228 |
-
logger.info(f"
|
| 229 |
|
| 230 |
# Combine
|
| 231 |
all_kw_set = set()
|
| 232 |
for category, skills in ner_skills.items():
|
| 233 |
for skill in skills:
|
| 234 |
all_kw_set.add(skill.lower())
|
| 235 |
-
for kw, score in keybert_kws:
|
| 236 |
-
all_kw_set.add(kw.lower())
|
| 237 |
for term in tfidf_terms:
|
| 238 |
all_kw_set.add(term.lower())
|
| 239 |
|
| 240 |
all_keywords_flat = sorted(all_kw_set)
|
| 241 |
|
| 242 |
-
# Required keywords
|
| 243 |
required = set()
|
| 244 |
for category, skills in ner_skills.items():
|
| 245 |
for skill in skills:
|
| 246 |
required.add(skill.lower())
|
| 247 |
-
for
|
| 248 |
-
|
| 249 |
-
required.add(kw.lower())
|
| 250 |
-
if not required and tfidf_terms:
|
| 251 |
-
for term in tfidf_terms[:10]:
|
| 252 |
-
required.add(term.lower())
|
| 253 |
|
| 254 |
required_keywords = sorted(required)
|
| 255 |
|
| 256 |
-
#
|
| 257 |
resume_keywords = []
|
| 258 |
missing_keywords = []
|
| 259 |
if resume_text:
|
|
@@ -264,11 +216,11 @@ def extract_all_keywords(
|
|
| 264 |
else:
|
| 265 |
missing_keywords.append(kw)
|
| 266 |
|
| 267 |
-
logger.info(f"
|
| 268 |
|
| 269 |
return {
|
| 270 |
"ner_skills": ner_skills,
|
| 271 |
-
"keybert_keywords":
|
| 272 |
"tfidf_terms": tfidf_terms,
|
| 273 |
"all_keywords_flat": all_keywords_flat,
|
| 274 |
"required_keywords": required_keywords,
|
|
@@ -279,24 +231,12 @@ def extract_all_keywords(
|
|
| 279 |
|
| 280 |
def format_keywords_report(kw_data: Dict) -> str:
|
| 281 |
lines = []
|
| 282 |
-
methods_active = []
|
| 283 |
-
if _ner_pipeline is not None:
|
| 284 |
-
methods_active.append("NER β
")
|
| 285 |
-
else:
|
| 286 |
-
methods_active.append("NER β (regex fallback)")
|
| 287 |
-
if _keybert_model is not None:
|
| 288 |
-
methods_active.append("KeyBERT β
")
|
| 289 |
-
else:
|
| 290 |
-
methods_active.append("KeyBERT β")
|
| 291 |
-
methods_active.append("TF-IDF β
")
|
| 292 |
-
|
| 293 |
lines.append("βββ KEYWORD EXTRACTION REPORT βββ")
|
| 294 |
-
lines.append(
|
| 295 |
|
| 296 |
ner = kw_data["ner_skills"]
|
| 297 |
if any(ner.values()):
|
| 298 |
-
|
| 299 |
-
lines.append(label)
|
| 300 |
if ner.get("TECHNOLOGY"):
|
| 301 |
lines.append(f" π» Technology: {', '.join(ner['TECHNOLOGY'])}")
|
| 302 |
if ner.get("TECHNICAL"):
|
|
@@ -307,17 +247,9 @@ def format_keywords_report(kw_data: Dict) -> str:
|
|
| 307 |
lines.append(f" π€ Soft Skills: {', '.join(ner['SOFT'])}")
|
| 308 |
lines.append("")
|
| 309 |
|
| 310 |
-
kb_kws = kw_data["keybert_keywords"]
|
| 311 |
-
if kb_kws:
|
| 312 |
-
lines.append("π Key Phrases (Semantic):")
|
| 313 |
-
for kw, score in kb_kws[:10]:
|
| 314 |
-
bar = "β" * int(score * 20) + "β" * (20 - int(score * 20))
|
| 315 |
-
lines.append(f" {bar} {score:.2f} {kw}")
|
| 316 |
-
lines.append("")
|
| 317 |
-
|
| 318 |
tfidf = kw_data.get("tfidf_terms", [])
|
| 319 |
if tfidf:
|
| 320 |
-
lines.append(f"π TF-IDF Terms: {', '.join(tfidf[:15])}")
|
| 321 |
lines.append("")
|
| 322 |
|
| 323 |
required = kw_data["required_keywords"]
|
|
|
|
| 1 |
"""
|
| 2 |
Component 2: JD Keyword Extractor
|
| 3 |
+
Uses regex skill matching + TF-IDF for instant, reliable keyword extraction.
|
| 4 |
+
No ML model inference during requests — works instantly on free CPU Spaces.
|
| 5 |
"""
|
| 6 |
|
| 7 |
import re
|
|
|
|
| 8 |
import logging
|
|
|
|
|
|
|
| 9 |
from typing import List, Dict, Tuple, Optional
|
|
|
|
| 10 |
|
| 11 |
logging.basicConfig(level=logging.INFO)
|
| 12 |
logger = logging.getLogger(__name__)
|
| 13 |
|
|
|
|
|
|
|
| 14 |
|
| 15 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 16 |
+
# TF-IDF EXTRACTION (always fast, no model needed)
|
| 17 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 18 |
|
| 19 |
+
def extract_keywords_tfidf(text: str, top_n: int = 20) -> List[str]:
|
| 20 |
+
"""Extract important terms using TF-IDF. Always works instantly."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 22 |
import numpy as np
|
| 23 |
|
|
|
|
| 28 |
scores = tfidf_matrix.toarray()[0]
|
| 29 |
top_indices = np.argsort(scores)[::-1][:top_n]
|
| 30 |
return [feature_names[i] for i in top_indices if scores[i] > 0]
|
| 31 |
+
except Exception as e:
|
| 32 |
+
logger.warning(f"TF-IDF error: {e}")
|
| 33 |
return []
|
| 34 |
|
| 35 |
|
| 36 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 37 |
+
# REGEX SKILL EXTRACTION (comprehensive vocabulary, instant)
|
| 38 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 39 |
+
|
| 40 |
# Skill vocabulary, grouped by category. Every entry is a lowercase regex
# fragment (hence the escaped "+" and the optional "\.?"), matched against
# lowercased text by extract_skills_regex(). A skill listed in more than one
# category (e.g. "ci/cd", "forecasting") is reported only under the first
# category in which it appears.
TECH_SKILLS_VOCAB = {
    "TECHNOLOGY": [
        "python", "java", "javascript", "typescript", "c\\+\\+", "c#", "ruby", "go",
        "rust", "scala", "kotlin", "swift", "php", "matlab", "julia",
        "sql", "nosql", "html", "css", "bash", "shell", "perl",
        "tensorflow", "pytorch", "keras", "scikit-learn", "sklearn",
        "pandas", "numpy", "scipy", "matplotlib", "seaborn", "plotly",
        "spark", "pyspark", "hadoop", "kafka", "airflow", "dbt", "flink",
        "docker", "kubernetes", "k8s", "terraform", "ansible", "helm",
        "aws", "azure", "gcp", "google cloud", "heroku",
        "react", "angular", "vue", "next\\.?js", "node\\.?js", "express",
        "django", "flask", "fastapi", "spring", "rails",
        "postgresql", "mysql", "mongodb", "redis", "elasticsearch", "cassandra",
        "dynamodb", "snowflake", "bigquery", "redshift", "databricks",
        "git", "jenkins", "ci/cd", "github", "gitlab", "bitbucket",
        "tableau", "power bi", "looker", "grafana", "superset",
        "mlflow", "kubeflow", "wandb", "dvc", "sagemaker",
        "bert", "gpt", "llm", "llms", "langchain", "openai",
        "hugging face", "transformers", "anthropic", "gemini",
        "jira", "confluence", "slack", "notion",
        "linux", "unix", "windows", "macos",
        "excel", "powerpoint", "word",
        "figma", "sketch", "adobe",
        "selenium", "cypress", "jest", "pytest",
        "nginx", "apache", "tomcat",
        "rabbitmq", "celery", "cron",
        "s3", "ec2", "lambda", "ecs", "eks",
        "vpc", "iam", "cloudformation", "cdk",
    ],
    "TECHNICAL": [
        "machine learning", "deep learning", "neural network", "neural networks",
        "nlp", "natural language processing", "computer vision",
        "reinforcement learning", "transfer learning", "federated learning",
        "data science", "data engineering", "data analytics", "data analysis",
        "data modeling", "data governance", "data quality",
        "statistical analysis", "statistics", "bayesian", "hypothesis testing",
        "a/b testing", "experimentation", "causal inference",
        "etl", "elt", "data pipeline", "data pipelines",
        "data warehouse", "data lake", "data mesh", "data catalog",
        "api", "rest", "restful", "graphql", "grpc", "websocket",
        "microservices", "distributed systems", "event driven",
        "cloud computing", "cloud architecture", "serverless",
        "devops", "mlops", "dataops", "gitops", "ci/cd",
        "agile", "scrum", "kanban", "lean",
        "recommendation system", "recommendation engine", "recommender",
        "search engine", "information retrieval", "ranking",
        "time series", "forecasting", "anomaly detection", "fraud detection",
        "classification", "regression", "clustering", "segmentation",
        "dimensionality reduction", "feature engineering", "feature selection",
        "model deployment", "model serving", "model monitoring",
        "generative ai", "gen ai", "rag", "retrieval augmented",
        "fine-tuning", "fine tuning", "prompt engineering", "few-shot",
        "object detection", "image segmentation", "image classification",
        "speech recognition", "text to speech", "sentiment analysis",
        "named entity recognition", "text classification", "summarization",
        "question answering", "chatbot", "conversational ai",
        "optimization", "linear programming", "operations research",
        "simulation", "monte carlo", "ab testing",
        "web scraping", "data collection", "annotation",
        "unit testing", "integration testing", "test driven",
        "system design", "software architecture", "design patterns",
        "database design", "schema design", "normalization",
        "version control", "code review", "pair programming",
        "containerization", "orchestration", "infrastructure as code",
        "monitoring", "logging", "alerting", "observability",
        "security", "encryption", "authentication", "authorization",
        "performance optimization", "scalability", "reliability",
        "data visualization", "dashboard", "reporting",
        "technical writing", "documentation",
        "research", "publications", "peer reviewed",
    ],
    "BUS": [
        "business intelligence", "bi", "kpi", "roi", "revenue",
        "product management", "product development", "product strategy",
        "business analysis", "requirements gathering", "process improvement",
        "customer success", "customer experience", "user experience",
        "marketing analytics", "growth", "acquisition", "retention",
        "financial analysis", "budgeting", "forecasting",
        "supply chain", "logistics", "inventory",
        "risk management", "compliance", "regulatory",
        "consulting", "advisory", "strategy",
    ],
    "SOFT": [
        "leadership", "communication", "teamwork", "collaboration",
        "problem solving", "problem-solving", "critical thinking",
        "analytical", "analytical thinking",
        "mentoring", "mentorship", "coaching",
        "presentation", "public speaking",
        "stakeholder management", "stakeholder engagement",
        "project management", "program management", "time management",
        "cross-functional", "cross functional",
        "strategic thinking", "strategic planning",
        "decision making", "decision-making",
        "conflict resolution", "negotiation",
        "adaptability", "flexibility", "resilience",
        "creativity", "innovation", "initiative",
        "attention to detail", "detail oriented", "detail-oriented",
        "self-motivated", "self motivated", "proactive",
        "interpersonal", "relationship building",
        "emotional intelligence", "empathy",
    ],
}


def _skill_label(pattern: str) -> str:
    """Convert a vocabulary regex fragment back into the plain skill text."""
    # The optional-dot form "\.?" must be rewritten before the bare "\."
    # escape; otherwise the "?" leaks into the label ("next.?js"), and that
    # label can never be found in a resume.
    return pattern.replace("\\.?", ".").replace("\\+", "+").replace("\\.", ".")


def extract_skills_regex(text: str) -> Dict[str, List[str]]:
    """Detect known skills in ``text`` by matching TECH_SKILLS_VOCAB patterns.

    Matching is case-insensitive (the text is lowercased) and a pattern only
    matches when it is not directly adjacent to a word character, so "java"
    does not match inside "javascripting" while "c++." and "c#," do match.

    Args:
        text: Text to scan, e.g. a job description. Empty or ``None`` yields
            empty result lists.

    Returns:
        A dict with the keys ``"BUS"``, ``"TECHNOLOGY"``, ``"TECHNICAL"`` and
        ``"SOFT"``. Each value lists the display names of the skills found,
        in vocabulary order: title-cased when longer than three characters,
        upper-cased otherwise. Each skill is reported once, under the first
        category that lists it.
    """
    skills = {"BUS": [], "TECHNOLOGY": [], "TECHNICAL": [], "SOFT": []}
    if not text:
        return skills

    text_lower = text.lower()
    seen = set()

    for category, patterns in TECH_SKILLS_VOCAB.items():
        for pattern in patterns:
            try:
                # Lookarounds instead of \b: \b fails after a non-word
                # character, so "c++" or "c#" followed by punctuation was
                # previously missed.
                found = re.search(r'(?<!\w)' + pattern + r'(?!\w)', text_lower)
            except re.error as exc:
                # Keep the best-effort behaviour for a bad vocabulary entry,
                # but make the problem visible.
                logger.warning(f"Skipping invalid skill pattern {pattern!r}: {exc}")
                continue
            if not found:
                continue
            clean = _skill_label(pattern)
            if clean in seen:
                continue
            seen.add(clean)
            display = clean.title() if len(clean) > 3 else clean.upper()
            skills.setdefault(category, []).append(display)

    return skills
|
| 163 |
|
| 164 |
|
| 165 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 166 |
+
# MAIN EXTRACTION FUNCTION
|
| 167 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 168 |
+
|
| 169 |
def extract_all_keywords(
|
| 170 |
job_description: str,
|
| 171 |
resume_text: Optional[str] = None,
|
| 172 |
) -> Dict:
|
| 173 |
+
"""
|
| 174 |
+
Fast keyword extraction using regex + TF-IDF.
|
| 175 |
+
No ML model inference β works instantly on any hardware.
|
| 176 |
+
"""
|
| 177 |
+
logger.info("Starting keyword extraction (regex + TF-IDF)...")
|
| 178 |
+
|
| 179 |
+
# Layer 1: Regex skill detection (instant)
|
| 180 |
+
ner_skills = extract_skills_regex(job_description)
|
| 181 |
+
skill_count = sum(len(v) for v in ner_skills.values())
|
| 182 |
+
logger.info(f" Regex skills: {skill_count} found")
|
| 183 |
+
|
| 184 |
+
# Layer 2: TF-IDF terms (instant)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
tfidf_terms = extract_keywords_tfidf(job_description, top_n=20)
|
| 186 |
+
logger.info(f" TF-IDF terms: {len(tfidf_terms)} found")
|
| 187 |
|
| 188 |
# Combine
|
| 189 |
all_kw_set = set()
|
| 190 |
for category, skills in ner_skills.items():
|
| 191 |
for skill in skills:
|
| 192 |
all_kw_set.add(skill.lower())
|
|
|
|
|
|
|
| 193 |
for term in tfidf_terms:
|
| 194 |
all_kw_set.add(term.lower())
|
| 195 |
|
| 196 |
all_keywords_flat = sorted(all_kw_set)
|
| 197 |
|
| 198 |
+
# Required keywords = all regex-matched skills + top TF-IDF
|
| 199 |
required = set()
|
| 200 |
for category, skills in ner_skills.items():
|
| 201 |
for skill in skills:
|
| 202 |
required.add(skill.lower())
|
| 203 |
+
for term in tfidf_terms[:10]:
|
| 204 |
+
required.add(term.lower())
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
|
| 206 |
required_keywords = sorted(required)
|
| 207 |
|
| 208 |
+
# Find what's missing from resume
|
| 209 |
resume_keywords = []
|
| 210 |
missing_keywords = []
|
| 211 |
if resume_text:
|
|
|
|
| 216 |
else:
|
| 217 |
missing_keywords.append(kw)
|
| 218 |
|
| 219 |
+
logger.info(f" Total: {len(all_keywords_flat)} keywords, {len(required_keywords)} required, {len(missing_keywords)} missing")
|
| 220 |
|
| 221 |
return {
|
| 222 |
"ner_skills": ner_skills,
|
| 223 |
+
"keybert_keywords": [], # not used β kept for compatibility
|
| 224 |
"tfidf_terms": tfidf_terms,
|
| 225 |
"all_keywords_flat": all_keywords_flat,
|
| 226 |
"required_keywords": required_keywords,
|
|
|
|
| 231 |
|
| 232 |
def format_keywords_report(kw_data: Dict) -> str:
|
| 233 |
lines = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
lines.append("βββ KEYWORD EXTRACTION REPORT βββ")
|
| 235 |
+
lines.append("Methods: Regex Skills β
| TF-IDF β
\n")
|
| 236 |
|
| 237 |
ner = kw_data["ner_skills"]
|
| 238 |
if any(ner.values()):
|
| 239 |
+
lines.append("π Skills Detected:")
|
|
|
|
| 240 |
if ner.get("TECHNOLOGY"):
|
| 241 |
lines.append(f" π» Technology: {', '.join(ner['TECHNOLOGY'])}")
|
| 242 |
if ner.get("TECHNICAL"):
|
|
|
|
| 247 |
lines.append(f" π€ Soft Skills: {', '.join(ner['SOFT'])}")
|
| 248 |
lines.append("")
|
| 249 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
tfidf = kw_data.get("tfidf_terms", [])
|
| 251 |
if tfidf:
|
| 252 |
+
lines.append(f"π TF-IDF Key Terms: {', '.join(tfidf[:15])}")
|
| 253 |
lines.append("")
|
| 254 |
|
| 255 |
required = kw_data["required_keywords"]
|