aloysia98 committed on
Commit
51e13a0
·
verified ·
1 Parent(s): 7daecb5

CRITICAL FIX: disable ML models for keyword extraction on free CPU - use regex+TF-IDF only (instant, no hanging)

Browse files
Files changed (1) hide show
  1. keyword_extractor.py +134 -202
keyword_extractor.py CHANGED
@@ -1,130 +1,23 @@
1
  """
2
  Component 2: JD Keyword Extractor
3
- 3-layer hybrid extraction with graceful degradation and timeouts.
 
4
  """
5
 
6
  import re
7
- import os
8
  import logging
9
- import signal
10
- import traceback
11
  from typing import List, Dict, Tuple, Optional
12
- from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError
13
 
14
  logging.basicConfig(level=logging.INFO)
15
  logger = logging.getLogger(__name__)
16
 
17
- # Timeout for individual model calls (seconds)
18
- MODEL_CALL_TIMEOUT = 30
19
 
20
- _ner_pipeline = None
21
- _keybert_model = None
22
- _models_loaded = False
23
 
24
-
25
- def _load_models():
26
- global _ner_pipeline, _keybert_model, _models_loaded
27
- if _models_loaded:
28
- return
29
-
30
- try:
31
- logger.info("Loading NER skill extraction model...")
32
- from transformers import pipeline
33
- _ner_pipeline = pipeline(
34
- task="token-classification",
35
- model="algiraldohe/lm-ner-linkedin-skills-recognition",
36
- aggregation_strategy="simple",
37
- device=-1,
38
- )
39
- logger.info("βœ… NER model loaded")
40
- except Exception as e:
41
- logger.warning(f"⚠️ NER model failed: {e}")
42
- _ner_pipeline = None
43
-
44
- try:
45
- logger.info("Loading KeyBERT model...")
46
- from keybert import KeyBERT
47
- _keybert_model = KeyBERT(model="all-MiniLM-L6-v2")
48
- logger.info("βœ… KeyBERT model loaded")
49
- except Exception as e:
50
- logger.warning(f"⚠️ KeyBERT model failed: {e}")
51
- _keybert_model = None
52
-
53
- _models_loaded = True
54
-
55
-
56
- _load_models()
57
-
58
-
59
- def _run_with_timeout(fn, timeout=MODEL_CALL_TIMEOUT, default=None):
60
- """Run a function with a timeout. Returns default if it times out or errors."""
61
- try:
62
- with ThreadPoolExecutor(max_workers=1) as executor:
63
- future = executor.submit(fn)
64
- return future.result(timeout=timeout)
65
- except FuturesTimeoutError:
66
- logger.warning(f"⚠️ {fn.__name__ if hasattr(fn, '__name__') else 'function'} timed out after {timeout}s")
67
- return default
68
- except Exception as e:
69
- logger.warning(f"⚠️ {fn.__name__ if hasattr(fn, '__name__') else 'function'} error: {e}")
70
- return default
71
-
72
-
73
- def extract_skills_ner(text: str, min_score: float = 0.65) -> Dict[str, List[str]]:
74
- empty = {"BUS": [], "TECHNOLOGY": [], "TECHNICAL": [], "SOFT": []}
75
- if _ner_pipeline is None:
76
- return empty
77
-
78
- def _do_ner():
79
- max_chars = 1800
80
- chunks = [text[i:i + max_chars] for i in range(0, len(text), max_chars)]
81
- all_entities = []
82
- for chunk in chunks:
83
- try:
84
- entities = _ner_pipeline(chunk)
85
- all_entities.extend(entities)
86
- except Exception:
87
- continue
88
-
89
- skills = {"BUS": [], "TECHNOLOGY": [], "TECHNICAL": [], "SOFT": []}
90
- seen = set()
91
- for ent in all_entities:
92
- label = ent.get("entity_group", "")
93
- word = ent.get("word", "").strip()
94
- score = ent.get("score", 0)
95
- word = re.sub(r'^[#\s]+|[#\s]+$', '', word)
96
- word = re.sub(r'\s+', ' ', word)
97
- if not word or len(word) < 2 or score < min_score:
98
- continue
99
- word_lower = word.lower()
100
- if label in skills and word_lower not in seen:
101
- seen.add(word_lower)
102
- skills[label].append(word)
103
- return skills
104
-
105
- result = _run_with_timeout(_do_ner, timeout=MODEL_CALL_TIMEOUT, default=empty)
106
- return result if result is not None else empty
107
-
108
-
109
- def extract_keywords_keybert(text: str, top_n: int = 20) -> List[Tuple[str, float]]:
110
- if _keybert_model is None:
111
- return []
112
-
113
- def _do_keybert():
114
- return _keybert_model.extract_keywords(
115
- text,
116
- keyphrase_ngram_range=(1, 3),
117
- stop_words="english",
118
- use_maxsum=True,
119
- nr_candidates=40,
120
- top_n=top_n,
121
- )
122
-
123
- result = _run_with_timeout(_do_keybert, timeout=MODEL_CALL_TIMEOUT, default=[])
124
- return result if result is not None else []
125
-
126
-
127
- def extract_keywords_tfidf(text: str, top_n: int = 15) -> List[str]:
128
  from sklearn.feature_extraction.text import TfidfVectorizer
129
  import numpy as np
130
 
@@ -135,125 +28,184 @@ def extract_keywords_tfidf(text: str, top_n: int = 15) -> List[str]:
135
  scores = tfidf_matrix.toarray()[0]
136
  top_indices = np.argsort(scores)[::-1][:top_n]
137
  return [feature_names[i] for i in top_indices if scores[i] > 0]
138
- except Exception:
 
139
  return []
140
 
141
 
 
 
 
 
142
  TECH_SKILLS_VOCAB = {
143
  "TECHNOLOGY": [
144
  "python", "java", "javascript", "typescript", "c\\+\\+", "c#", "ruby", "go",
145
  "rust", "scala", "kotlin", "swift", "php", "matlab", "julia",
146
- "sql", "nosql", "html", "css", "bash", "shell",
147
- "tensorflow", "pytorch", "keras", "scikit-learn",
148
- "pandas", "numpy", "scipy", "matplotlib",
149
- "spark", "hadoop", "kafka", "airflow", "dbt",
150
- "docker", "kubernetes", "terraform", "ansible",
151
- "aws", "azure", "gcp", "google cloud",
152
- "react", "angular", "vue", "django", "flask", "fastapi",
153
- "postgresql", "mysql", "mongodb", "redis", "elasticsearch",
154
- "git", "jenkins", "github", "gitlab",
155
- "tableau", "power bi", "looker", "grafana",
156
- "mlflow", "kubeflow", "wandb",
157
- "bert", "gpt", "llm", "llms", "langchain",
 
 
 
 
 
 
 
 
 
 
 
 
158
  ],
159
  "TECHNICAL": [
160
- "machine learning", "deep learning", "neural network", "nlp",
161
- "natural language processing", "computer vision", "reinforcement learning",
 
162
  "data science", "data engineering", "data analytics", "data analysis",
163
- "statistical analysis", "statistics", "a/b testing",
164
- "etl", "data pipeline", "data warehouse",
165
- "api", "rest", "microservices", "distributed systems",
166
- "cloud computing", "devops", "mlops",
167
- "agile", "scrum",
168
- "recommendation system", "time series", "forecasting",
169
- "classification", "regression", "clustering",
170
- "feature engineering", "model deployment",
171
- "generative ai", "rag", "fine-tuning", "prompt engineering",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  ],
173
  "SOFT": [
174
  "leadership", "communication", "teamwork", "collaboration",
175
- "problem solving", "critical thinking", "analytical",
176
- "mentoring", "coaching", "presentation",
177
- "stakeholder management", "project management",
178
- "cross-functional", "strategic thinking",
 
 
 
 
 
 
 
 
 
 
 
 
179
  ],
180
  }
181
 
182
 
183
  def extract_skills_regex(text: str) -> Dict[str, List[str]]:
 
184
  text_lower = text.lower()
185
  skills = {"BUS": [], "TECHNOLOGY": [], "TECHNICAL": [], "SOFT": []}
186
  seen = set()
 
187
  for category, patterns in TECH_SKILLS_VOCAB.items():
188
  for pattern in patterns:
189
  try:
190
- if re.search(r'\b' + pattern + r'\b', text_lower, re.IGNORECASE):
191
- clean = pattern.replace("\\+", "+").replace("\\.", ".")
192
  if clean not in seen:
193
  seen.add(clean)
194
  display = clean.title() if len(clean) > 3 else clean.upper()
195
  skills[category].append(display)
196
  except Exception:
197
  continue
 
198
  return skills
199
 
200
 
 
 
 
 
201
  def extract_all_keywords(
202
  job_description: str,
203
  resume_text: Optional[str] = None,
204
  ) -> Dict:
205
- """Full keyword extraction with timeouts and fallbacks."""
206
- logger.info("Starting keyword extraction...")
207
-
208
- # Layer 1: NER (with timeout) or regex fallback
209
- logger.info(" Layer 1: NER extraction...")
210
- if _ner_pipeline is not None:
211
- ner_skills = extract_skills_ner(job_description)
212
- # If NER returned nothing, fall back to regex
213
- if not any(ner_skills.values()):
214
- logger.info(" NER returned empty, falling back to regex")
215
- ner_skills = extract_skills_regex(job_description)
216
- else:
217
- ner_skills = extract_skills_regex(job_description)
218
- logger.info(f" Layer 1 done: {sum(len(v) for v in ner_skills.values())} skills found")
219
-
220
- # Layer 2: KeyBERT (with timeout)
221
- logger.info(" Layer 2: KeyBERT extraction...")
222
- keybert_kws = extract_keywords_keybert(job_description, top_n=20)
223
- logger.info(f" Layer 2 done: {len(keybert_kws)} keywords found")
224
-
225
- # Layer 3: TF-IDF (always fast)
226
- logger.info(" Layer 3: TF-IDF extraction...")
227
  tfidf_terms = extract_keywords_tfidf(job_description, top_n=20)
228
- logger.info(f" Layer 3 done: {len(tfidf_terms)} terms found")
229
 
230
  # Combine
231
  all_kw_set = set()
232
  for category, skills in ner_skills.items():
233
  for skill in skills:
234
  all_kw_set.add(skill.lower())
235
- for kw, score in keybert_kws:
236
- all_kw_set.add(kw.lower())
237
  for term in tfidf_terms:
238
  all_kw_set.add(term.lower())
239
 
240
  all_keywords_flat = sorted(all_kw_set)
241
 
242
- # Required keywords
243
  required = set()
244
  for category, skills in ner_skills.items():
245
  for skill in skills:
246
  required.add(skill.lower())
247
- for kw, score in keybert_kws:
248
- if score > 0.35:
249
- required.add(kw.lower())
250
- if not required and tfidf_terms:
251
- for term in tfidf_terms[:10]:
252
- required.add(term.lower())
253
 
254
  required_keywords = sorted(required)
255
 
256
- # Missing from resume
257
  resume_keywords = []
258
  missing_keywords = []
259
  if resume_text:
@@ -264,11 +216,11 @@ def extract_all_keywords(
264
  else:
265
  missing_keywords.append(kw)
266
 
267
- logger.info(f"Keyword extraction complete: {len(all_keywords_flat)} total, {len(required_keywords)} required, {len(missing_keywords)} missing")
268
 
269
  return {
270
  "ner_skills": ner_skills,
271
- "keybert_keywords": keybert_kws,
272
  "tfidf_terms": tfidf_terms,
273
  "all_keywords_flat": all_keywords_flat,
274
  "required_keywords": required_keywords,
@@ -279,24 +231,12 @@ def extract_all_keywords(
279
 
280
  def format_keywords_report(kw_data: Dict) -> str:
281
  lines = []
282
- methods_active = []
283
- if _ner_pipeline is not None:
284
- methods_active.append("NER βœ…")
285
- else:
286
- methods_active.append("NER ❌ (regex fallback)")
287
- if _keybert_model is not None:
288
- methods_active.append("KeyBERT βœ…")
289
- else:
290
- methods_active.append("KeyBERT ❌")
291
- methods_active.append("TF-IDF βœ…")
292
-
293
  lines.append("═══ KEYWORD EXTRACTION REPORT ═══")
294
- lines.append(f"Methods: {' | '.join(methods_active)}\n")
295
 
296
  ner = kw_data["ner_skills"]
297
  if any(ner.values()):
298
- label = "πŸ” Skills Detected:"
299
- lines.append(label)
300
  if ner.get("TECHNOLOGY"):
301
  lines.append(f" πŸ’» Technology: {', '.join(ner['TECHNOLOGY'])}")
302
  if ner.get("TECHNICAL"):
@@ -307,17 +247,9 @@ def format_keywords_report(kw_data: Dict) -> str:
307
  lines.append(f" 🀝 Soft Skills: {', '.join(ner['SOFT'])}")
308
  lines.append("")
309
 
310
- kb_kws = kw_data["keybert_keywords"]
311
- if kb_kws:
312
- lines.append("πŸ”‘ Key Phrases (Semantic):")
313
- for kw, score in kb_kws[:10]:
314
- bar = "β–ˆ" * int(score * 20) + "β–‘" * (20 - int(score * 20))
315
- lines.append(f" {bar} {score:.2f} {kw}")
316
- lines.append("")
317
-
318
  tfidf = kw_data.get("tfidf_terms", [])
319
  if tfidf:
320
- lines.append(f"πŸ“Š TF-IDF Terms: {', '.join(tfidf[:15])}")
321
  lines.append("")
322
 
323
  required = kw_data["required_keywords"]
 
1
  """
2
  Component 2: JD Keyword Extractor
3
+ Uses regex skill matching + TF-IDF for instant, reliable keyword extraction.
4
+ No ML model inference during requests — works instantly on free CPU Spaces.
5
  """
6
 
7
  import re
 
8
  import logging
 
 
9
  from typing import List, Dict, Tuple, Optional
 
10
 
11
  logging.basicConfig(level=logging.INFO)
12
  logger = logging.getLogger(__name__)
13
 
 
 
14
 
15
+ # ═══════════════════════════════════════════════════════════════════════
16
+ # TF-IDF EXTRACTION (always fast, no model needed)
17
+ # ═══════════════════════════════════════════════════════════════════════
18
 
19
+ def extract_keywords_tfidf(text: str, top_n: int = 20) -> List[str]:
20
+ """Extract important terms using TF-IDF. Always works instantly."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  from sklearn.feature_extraction.text import TfidfVectorizer
22
  import numpy as np
23
 
 
28
  scores = tfidf_matrix.toarray()[0]
29
  top_indices = np.argsort(scores)[::-1][:top_n]
30
  return [feature_names[i] for i in top_indices if scores[i] > 0]
31
+ except Exception as e:
32
+ logger.warning(f"TF-IDF error: {e}")
33
  return []
34
 
35
 
36
+ # ═══════════════════════════════════════════════════════════════════════
37
+ # REGEX SKILL EXTRACTION (comprehensive vocabulary, instant)
38
+ # ═══════════════════════════════════════════════════════════════════════
39
+
40
  TECH_SKILLS_VOCAB = {
41
  "TECHNOLOGY": [
42
  "python", "java", "javascript", "typescript", "c\\+\\+", "c#", "ruby", "go",
43
  "rust", "scala", "kotlin", "swift", "php", "matlab", "julia",
44
+ "sql", "nosql", "html", "css", "bash", "shell", "perl",
45
+ "tensorflow", "pytorch", "keras", "scikit-learn", "sklearn",
46
+ "pandas", "numpy", "scipy", "matplotlib", "seaborn", "plotly",
47
+ "spark", "pyspark", "hadoop", "kafka", "airflow", "dbt", "flink",
48
+ "docker", "kubernetes", "k8s", "terraform", "ansible", "helm",
49
+ "aws", "azure", "gcp", "google cloud", "heroku",
50
+ "react", "angular", "vue", "next\\.?js", "node\\.?js", "express",
51
+ "django", "flask", "fastapi", "spring", "rails",
52
+ "postgresql", "mysql", "mongodb", "redis", "elasticsearch", "cassandra",
53
+ "dynamodb", "snowflake", "bigquery", "redshift", "databricks",
54
+ "git", "jenkins", "ci/cd", "github", "gitlab", "bitbucket",
55
+ "tableau", "power bi", "looker", "grafana", "superset",
56
+ "mlflow", "kubeflow", "wandb", "dvc", "sagemaker",
57
+ "bert", "gpt", "llm", "llms", "langchain", "openai",
58
+ "hugging face", "transformers", "anthropic", "gemini",
59
+ "jira", "confluence", "slack", "notion",
60
+ "linux", "unix", "windows", "macos",
61
+ "excel", "powerpoint", "word",
62
+ "figma", "sketch", "adobe",
63
+ "selenium", "cypress", "jest", "pytest",
64
+ "nginx", "apache", "tomcat",
65
+ "rabbitmq", "celery", "cron",
66
+ "s3", "ec2", "lambda", "ecs", "eks",
67
+ "vpc", "iam", "cloudformation", "cdk",
68
  ],
69
  "TECHNICAL": [
70
+ "machine learning", "deep learning", "neural network", "neural networks",
71
+ "nlp", "natural language processing", "computer vision",
72
+ "reinforcement learning", "transfer learning", "federated learning",
73
  "data science", "data engineering", "data analytics", "data analysis",
74
+ "data modeling", "data governance", "data quality",
75
+ "statistical analysis", "statistics", "bayesian", "hypothesis testing",
76
+ "a/b testing", "experimentation", "causal inference",
77
+ "etl", "elt", "data pipeline", "data pipelines",
78
+ "data warehouse", "data lake", "data mesh", "data catalog",
79
+ "api", "rest", "restful", "graphql", "grpc", "websocket",
80
+ "microservices", "distributed systems", "event driven",
81
+ "cloud computing", "cloud architecture", "serverless",
82
+ "devops", "mlops", "dataops", "gitops", "ci/cd",
83
+ "agile", "scrum", "kanban", "lean",
84
+ "recommendation system", "recommendation engine", "recommender",
85
+ "search engine", "information retrieval", "ranking",
86
+ "time series", "forecasting", "anomaly detection", "fraud detection",
87
+ "classification", "regression", "clustering", "segmentation",
88
+ "dimensionality reduction", "feature engineering", "feature selection",
89
+ "model deployment", "model serving", "model monitoring",
90
+ "generative ai", "gen ai", "rag", "retrieval augmented",
91
+ "fine-tuning", "fine tuning", "prompt engineering", "few-shot",
92
+ "object detection", "image segmentation", "image classification",
93
+ "speech recognition", "text to speech", "sentiment analysis",
94
+ "named entity recognition", "text classification", "summarization",
95
+ "question answering", "chatbot", "conversational ai",
96
+ "optimization", "linear programming", "operations research",
97
+ "simulation", "monte carlo", "ab testing",
98
+ "web scraping", "data collection", "annotation",
99
+ "unit testing", "integration testing", "test driven",
100
+ "system design", "software architecture", "design patterns",
101
+ "database design", "schema design", "normalization",
102
+ "version control", "code review", "pair programming",
103
+ "containerization", "orchestration", "infrastructure as code",
104
+ "monitoring", "logging", "alerting", "observability",
105
+ "security", "encryption", "authentication", "authorization",
106
+ "performance optimization", "scalability", "reliability",
107
+ "data visualization", "dashboard", "reporting",
108
+ "technical writing", "documentation",
109
+ "research", "publications", "peer reviewed",
110
+ ],
111
+ "BUS": [
112
+ "business intelligence", "bi", "kpi", "roi", "revenue",
113
+ "product management", "product development", "product strategy",
114
+ "business analysis", "requirements gathering", "process improvement",
115
+ "customer success", "customer experience", "user experience",
116
+ "marketing analytics", "growth", "acquisition", "retention",
117
+ "financial analysis", "budgeting", "forecasting",
118
+ "supply chain", "logistics", "inventory",
119
+ "risk management", "compliance", "regulatory",
120
+ "consulting", "advisory", "strategy",
121
  ],
122
  "SOFT": [
123
  "leadership", "communication", "teamwork", "collaboration",
124
+ "problem solving", "problem-solving", "critical thinking",
125
+ "analytical", "analytical thinking",
126
+ "mentoring", "mentorship", "coaching",
127
+ "presentation", "public speaking",
128
+ "stakeholder management", "stakeholder engagement",
129
+ "project management", "program management", "time management",
130
+ "cross-functional", "cross functional",
131
+ "strategic thinking", "strategic planning",
132
+ "decision making", "decision-making",
133
+ "conflict resolution", "negotiation",
134
+ "adaptability", "flexibility", "resilience",
135
+ "creativity", "innovation", "initiative",
136
+ "attention to detail", "detail oriented", "detail-oriented",
137
+ "self-motivated", "self motivated", "proactive",
138
+ "interpersonal", "relationship building",
139
+ "emotional intelligence", "empathy",
140
  ],
141
  }
142
 
143
 
144
  def extract_skills_regex(text: str) -> Dict[str, List[str]]:
145
+ """Regex-based skill extraction. Instant, no model needed."""
146
  text_lower = text.lower()
147
  skills = {"BUS": [], "TECHNOLOGY": [], "TECHNICAL": [], "SOFT": []}
148
  seen = set()
149
+
150
  for category, patterns in TECH_SKILLS_VOCAB.items():
151
  for pattern in patterns:
152
  try:
153
+ if re.search(r'(?:^|\b|[\s,;(])' + pattern + r'(?:$|\b|[\s,;)])', text_lower):
154
+ clean = pattern.replace("\\+", "+").replace("\\.", ".").replace("\\?", "")
155
  if clean not in seen:
156
  seen.add(clean)
157
  display = clean.title() if len(clean) > 3 else clean.upper()
158
  skills[category].append(display)
159
  except Exception:
160
  continue
161
+
162
  return skills
163
 
164
 
165
+ # ═══════════════════════════════════════════════════════════════════════
166
+ # MAIN EXTRACTION FUNCTION
167
+ # ═══════════════════════════════════════════════════════════════════════
168
+
169
  def extract_all_keywords(
170
  job_description: str,
171
  resume_text: Optional[str] = None,
172
  ) -> Dict:
173
+ """
174
+ Fast keyword extraction using regex + TF-IDF.
175
+ No ML model inference — works instantly on any hardware.
176
+ """
177
+ logger.info("Starting keyword extraction (regex + TF-IDF)...")
178
+
179
+ # Layer 1: Regex skill detection (instant)
180
+ ner_skills = extract_skills_regex(job_description)
181
+ skill_count = sum(len(v) for v in ner_skills.values())
182
+ logger.info(f" Regex skills: {skill_count} found")
183
+
184
+ # Layer 2: TF-IDF terms (instant)
 
 
 
 
 
 
 
 
 
 
185
  tfidf_terms = extract_keywords_tfidf(job_description, top_n=20)
186
+ logger.info(f" TF-IDF terms: {len(tfidf_terms)} found")
187
 
188
  # Combine
189
  all_kw_set = set()
190
  for category, skills in ner_skills.items():
191
  for skill in skills:
192
  all_kw_set.add(skill.lower())
 
 
193
  for term in tfidf_terms:
194
  all_kw_set.add(term.lower())
195
 
196
  all_keywords_flat = sorted(all_kw_set)
197
 
198
+ # Required keywords = all regex-matched skills + top TF-IDF
199
  required = set()
200
  for category, skills in ner_skills.items():
201
  for skill in skills:
202
  required.add(skill.lower())
203
+ for term in tfidf_terms[:10]:
204
+ required.add(term.lower())
 
 
 
 
205
 
206
  required_keywords = sorted(required)
207
 
208
+ # Find what's missing from resume
209
  resume_keywords = []
210
  missing_keywords = []
211
  if resume_text:
 
216
  else:
217
  missing_keywords.append(kw)
218
 
219
+ logger.info(f" Total: {len(all_keywords_flat)} keywords, {len(required_keywords)} required, {len(missing_keywords)} missing")
220
 
221
  return {
222
  "ner_skills": ner_skills,
223
+ "keybert_keywords": [], # not used — kept for compatibility
224
  "tfidf_terms": tfidf_terms,
225
  "all_keywords_flat": all_keywords_flat,
226
  "required_keywords": required_keywords,
 
231
 
232
  def format_keywords_report(kw_data: Dict) -> str:
233
  lines = []
 
 
 
 
 
 
 
 
 
 
 
234
  lines.append("═══ KEYWORD EXTRACTION REPORT ═══")
235
+ lines.append("Methods: Regex Skills βœ… | TF-IDF βœ…\n")
236
 
237
  ner = kw_data["ner_skills"]
238
  if any(ner.values()):
239
+ lines.append("πŸ” Skills Detected:")
 
240
  if ner.get("TECHNOLOGY"):
241
  lines.append(f" πŸ’» Technology: {', '.join(ner['TECHNOLOGY'])}")
242
  if ner.get("TECHNICAL"):
 
247
  lines.append(f" 🀝 Soft Skills: {', '.join(ner['SOFT'])}")
248
  lines.append("")
249
 
 
 
 
 
 
 
 
 
250
  tfidf = kw_data.get("tfidf_terms", [])
251
  if tfidf:
252
+ lines.append(f"πŸ“Š TF-IDF Key Terms: {', '.join(tfidf[:15])}")
253
  lines.append("")
254
 
255
  required = kw_data["required_keywords"]