Spaces:

aloysia98
/

ats-proof-cv-generator

Sleeping

App Files Files Community

aloysia98 committed on Apr 24

Commit

51e13a0

verified ·

1 Parent(s): 7daecb5

CRITICAL FIX: disable ML models for keyword extraction on free CPU - use regex+TF-IDF only (instant, no hanging)

Browse files

Files changed (1) hide show

keyword_extractor.py +134 -202

keyword_extractor.py CHANGED Viewed

@@ -1,130 +1,23 @@
 """
 Component 2: JD Keyword Extractor
-3-layer hybrid extraction with graceful degradation and timeouts.
 """
 import re
-import os
 import logging
-import signal
-import traceback
 from typing import List, Dict, Tuple, Optional
-from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# Timeout for individual model calls (seconds)
-MODEL_CALL_TIMEOUT = 30
-_ner_pipeline = None
-_keybert_model = None
-_models_loaded = False
-def _load_models():
-    global _ner_pipeline, _keybert_model, _models_loaded
-    if _models_loaded:
-        return
-    try:
-        logger.info("Loading NER skill extraction model...")
-        from transformers import pipeline
-        _ner_pipeline = pipeline(
-            task="token-classification",
-            model="algiraldohe/lm-ner-linkedin-skills-recognition",
-            aggregation_strategy="simple",
-            device=-1,
-        )
-        logger.info("✅ NER model loaded")
-    except Exception as e:
-        logger.warning(f"⚠️ NER model failed: {e}")
-        _ner_pipeline = None
-    try:
-        logger.info("Loading KeyBERT model...")
-        from keybert import KeyBERT
-        _keybert_model = KeyBERT(model="all-MiniLM-L6-v2")
-        logger.info("✅ KeyBERT model loaded")
-    except Exception as e:
-        logger.warning(f"⚠️ KeyBERT model failed: {e}")
-        _keybert_model = None
-    _models_loaded = True
-_load_models()
-def _run_with_timeout(fn, timeout=MODEL_CALL_TIMEOUT, default=None):
-    """Run a function with a timeout. Returns default if it times out or errors."""
-    try:
-        with ThreadPoolExecutor(max_workers=1) as executor:
-            future = executor.submit(fn)
-            return future.result(timeout=timeout)
-    except FuturesTimeoutError:
-        logger.warning(f"⚠️ {fn.__name__ if hasattr(fn, '__name__') else 'function'} timed out after {timeout}s")
-        return default
-    except Exception as e:
-        logger.warning(f"⚠️ {fn.__name__ if hasattr(fn, '__name__') else 'function'} error: {e}")
-        return default
-def extract_skills_ner(text: str, min_score: float = 0.65) -> Dict[str, List[str]]:
-    empty = {"BUS": [], "TECHNOLOGY": [], "TECHNICAL": [], "SOFT": []}
-    if _ner_pipeline is None:
-        return empty
-    def _do_ner():
-        max_chars = 1800
-        chunks = [text[i:i + max_chars] for i in range(0, len(text), max_chars)]
-        all_entities = []
-        for chunk in chunks:
-            try:
-                entities = _ner_pipeline(chunk)
-                all_entities.extend(entities)
-            except Exception:
-                continue
-        skills = {"BUS": [], "TECHNOLOGY": [], "TECHNICAL": [], "SOFT": []}
-        seen = set()
-        for ent in all_entities:
-            label = ent.get("entity_group", "")
-            word = ent.get("word", "").strip()
-            score = ent.get("score", 0)
-            word = re.sub(r'^[#\s]+|[#\s]+$', '', word)
-            word = re.sub(r'\s+', ' ', word)
-            if not word or len(word) < 2 or score < min_score:
-                continue
-            word_lower = word.lower()
-            if label in skills and word_lower not in seen:
-                seen.add(word_lower)
-                skills[label].append(word)
-        return skills
-    result = _run_with_timeout(_do_ner, timeout=MODEL_CALL_TIMEOUT, default=empty)
-    return result if result is not None else empty
-def extract_keywords_keybert(text: str, top_n: int = 20) -> List[Tuple[str, float]]:
-    if _keybert_model is None:
-        return []
-    def _do_keybert():
-        return _keybert_model.extract_keywords(
-            text,
-            keyphrase_ngram_range=(1, 3),
-            stop_words="english",
-            use_maxsum=True,
-            nr_candidates=40,
-            top_n=top_n,
-        )
-    result = _run_with_timeout(_do_keybert, timeout=MODEL_CALL_TIMEOUT, default=[])
-    return result if result is not None else []
-def extract_keywords_tfidf(text: str, top_n: int = 15) -> List[str]:
     from sklearn.feature_extraction.text import TfidfVectorizer
     import numpy as np
@@ -135,125 +28,184 @@ def extract_keywords_tfidf(text: str, top_n: int = 15) -> List[str]:
         scores = tfidf_matrix.toarray()[0]
         top_indices = np.argsort(scores)[::-1][:top_n]
         return [feature_names[i] for i in top_indices if scores[i] > 0]
-    except Exception:
         return []
 TECH_SKILLS_VOCAB = {
     "TECHNOLOGY": [
         "python", "java", "javascript", "typescript", "c\\+\\+", "c#", "ruby", "go",
         "rust", "scala", "kotlin", "swift", "php", "matlab", "julia",
-        "sql", "nosql", "html", "css", "bash", "shell",
-        "tensorflow", "pytorch", "keras", "scikit-learn",
-        "pandas", "numpy", "scipy", "matplotlib",
-        "spark", "hadoop", "kafka", "airflow", "dbt",
-        "docker", "kubernetes", "terraform", "ansible",
-        "aws", "azure", "gcp", "google cloud",
-        "react", "angular", "vue", "django", "flask", "fastapi",
-        "postgresql", "mysql", "mongodb", "redis", "elasticsearch",
-        "git", "jenkins", "github", "gitlab",
-        "tableau", "power bi", "looker", "grafana",
-        "mlflow", "kubeflow", "wandb",
-        "bert", "gpt", "llm", "llms", "langchain",
     ],
     "TECHNICAL": [
-        "machine learning", "deep learning", "neural network", "nlp",
-        "natural language processing", "computer vision", "reinforcement learning",
         "data science", "data engineering", "data analytics", "data analysis",
-        "statistical analysis", "statistics", "a/b testing",
-        "etl", "data pipeline", "data warehouse",
-        "api", "rest", "microservices", "distributed systems",
-        "cloud computing", "devops", "mlops",
-        "agile", "scrum",
-        "recommendation system", "time series", "forecasting",
-        "classification", "regression", "clustering",
-        "feature engineering", "model deployment",
-        "generative ai", "rag", "fine-tuning", "prompt engineering",
     ],
     "SOFT": [
         "leadership", "communication", "teamwork", "collaboration",
-        "problem solving", "critical thinking", "analytical",
-        "mentoring", "coaching", "presentation",
-        "stakeholder management", "project management",
-        "cross-functional", "strategic thinking",
     ],
 }
 def extract_skills_regex(text: str) -> Dict[str, List[str]]:
     text_lower = text.lower()
     skills = {"BUS": [], "TECHNOLOGY": [], "TECHNICAL": [], "SOFT": []}
     seen = set()
     for category, patterns in TECH_SKILLS_VOCAB.items():
         for pattern in patterns:
             try:
-                if re.search(r'\b' + pattern + r'\b', text_lower, re.IGNORECASE):
-                    clean = pattern.replace("\\+", "+").replace("\\.", ".")
                     if clean not in seen:
                         seen.add(clean)
                         display = clean.title() if len(clean) > 3 else clean.upper()
                         skills[category].append(display)
             except Exception:
                 continue
     return skills
 def extract_all_keywords(
     job_description: str,
     resume_text: Optional[str] = None,
 ) -> Dict:
-    """Full keyword extraction with timeouts and fallbacks."""
-    logger.info("Starting keyword extraction...")
-    # Layer 1: NER (with timeout) or regex fallback
-    logger.info("  Layer 1: NER extraction...")
-    if _ner_pipeline is not None:
-        ner_skills = extract_skills_ner(job_description)
-        # If NER returned nothing, fall back to regex
-        if not any(ner_skills.values()):
-            logger.info("  NER returned empty, falling back to regex")
-            ner_skills = extract_skills_regex(job_description)
-    else:
-        ner_skills = extract_skills_regex(job_description)
-    logger.info(f"  Layer 1 done: {sum(len(v) for v in ner_skills.values())} skills found")
-    # Layer 2: KeyBERT (with timeout)
-    logger.info("  Layer 2: KeyBERT extraction...")
-    keybert_kws = extract_keywords_keybert(job_description, top_n=20)
-    logger.info(f"  Layer 2 done: {len(keybert_kws)} keywords found")
-    # Layer 3: TF-IDF (always fast)
-    logger.info("  Layer 3: TF-IDF extraction...")
     tfidf_terms = extract_keywords_tfidf(job_description, top_n=20)
-    logger.info(f"  Layer 3 done: {len(tfidf_terms)} terms found")
     # Combine
     all_kw_set = set()
     for category, skills in ner_skills.items():
         for skill in skills:
             all_kw_set.add(skill.lower())
-    for kw, score in keybert_kws:
-        all_kw_set.add(kw.lower())
     for term in tfidf_terms:
         all_kw_set.add(term.lower())
     all_keywords_flat = sorted(all_kw_set)
-    # Required keywords
     required = set()
     for category, skills in ner_skills.items():
         for skill in skills:
             required.add(skill.lower())
-    for kw, score in keybert_kws:
-        if score > 0.35:
-            required.add(kw.lower())
-    if not required and tfidf_terms:
-        for term in tfidf_terms[:10]:
-            required.add(term.lower())
     required_keywords = sorted(required)
-    # Missing from resume
     resume_keywords = []
     missing_keywords = []
     if resume_text:
@@ -264,11 +216,11 @@ def extract_all_keywords(
             else:
                 missing_keywords.append(kw)
-    logger.info(f"Keyword extraction complete: {len(all_keywords_flat)} total, {len(required_keywords)} required, {len(missing_keywords)} missing")
     return {
         "ner_skills": ner_skills,
-        "keybert_keywords": keybert_kws,
         "tfidf_terms": tfidf_terms,
         "all_keywords_flat": all_keywords_flat,
         "required_keywords": required_keywords,
@@ -279,24 +231,12 @@ def extract_all_keywords(
 def format_keywords_report(kw_data: Dict) -> str:
     lines = []
-    methods_active = []
-    if _ner_pipeline is not None:
-        methods_active.append("NER ✅")
-    else:
-        methods_active.append("NER ❌ (regex fallback)")
-    if _keybert_model is not None:
-        methods_active.append("KeyBERT ✅")
-    else:
-        methods_active.append("KeyBERT ❌")
-    methods_active.append("TF-IDF ✅")
     lines.append("═══ KEYWORD EXTRACTION REPORT ═══")
-    lines.append(f"Methods: {' | '.join(methods_active)}\n")
     ner = kw_data["ner_skills"]
     if any(ner.values()):
-        label = "🔍 Skills Detected:"
-        lines.append(label)
         if ner.get("TECHNOLOGY"):
             lines.append(f"  💻 Technology: {', '.join(ner['TECHNOLOGY'])}")
         if ner.get("TECHNICAL"):
@@ -307,17 +247,9 @@ def format_keywords_report(kw_data: Dict) -> str:
             lines.append(f"  🤝 Soft Skills: {', '.join(ner['SOFT'])}")
         lines.append("")
-    kb_kws = kw_data["keybert_keywords"]
-    if kb_kws:
-        lines.append("🔑 Key Phrases (Semantic):")
-        for kw, score in kb_kws[:10]:
-            bar = "█" * int(score * 20) + "░" * (20 - int(score * 20))
-            lines.append(f"  {bar} {score:.2f}  {kw}")
-        lines.append("")
     tfidf = kw_data.get("tfidf_terms", [])
     if tfidf:
-        lines.append(f"📊 TF-IDF Terms: {', '.join(tfidf[:15])}")
         lines.append("")
     required = kw_data["required_keywords"]

 """
 Component 2: JD Keyword Extractor
+Uses regex skill matching + TF-IDF for instant, reliable keyword extraction.
+No ML model inference during requests — works instantly on free CPU Spaces.
 """
 import re
 import logging
 from typing import List, Dict, Tuple, Optional
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+# ═══════════════════════════════════════════════════════════════════════
+# TF-IDF EXTRACTION (always fast, no model needed)
+# ═══════════════════════════════════════════════════════════════════════
+def extract_keywords_tfidf(text: str, top_n: int = 20) -> List[str]:
+    """Extract important terms using TF-IDF. Always works instantly."""
     from sklearn.feature_extraction.text import TfidfVectorizer
     import numpy as np
         scores = tfidf_matrix.toarray()[0]
         top_indices = np.argsort(scores)[::-1][:top_n]
         return [feature_names[i] for i in top_indices if scores[i] > 0]
+    except Exception as e:
+        logger.warning(f"TF-IDF error: {e}")
         return []
+# ═══════════════════════════════════════════════════════════════════════
+# REGEX SKILL EXTRACTION (comprehensive vocabulary, instant)
+# ═══════════════════════════════════════════════════════════════════════
 TECH_SKILLS_VOCAB = {
     "TECHNOLOGY": [
         "python", "java", "javascript", "typescript", "c\\+\\+", "c#", "ruby", "go",
         "rust", "scala", "kotlin", "swift", "php", "matlab", "julia",
+        "sql", "nosql", "html", "css", "bash", "shell", "perl",
+        "tensorflow", "pytorch", "keras", "scikit-learn", "sklearn",
+        "pandas", "numpy", "scipy", "matplotlib", "seaborn", "plotly",
+        "spark", "pyspark", "hadoop", "kafka", "airflow", "dbt", "flink",
+        "docker", "kubernetes", "k8s", "terraform", "ansible", "helm",
+        "aws", "azure", "gcp", "google cloud", "heroku",
+        "react", "angular", "vue", "next\\.?js", "node\\.?js", "express",
+        "django", "flask", "fastapi", "spring", "rails",
+        "postgresql", "mysql", "mongodb", "redis", "elasticsearch", "cassandra",
+        "dynamodb", "snowflake", "bigquery", "redshift", "databricks",
+        "git", "jenkins", "ci/cd", "github", "gitlab", "bitbucket",
+        "tableau", "power bi", "looker", "grafana", "superset",
+        "mlflow", "kubeflow", "wandb", "dvc", "sagemaker",
+        "bert", "gpt", "llm", "llms", "langchain", "openai",
+        "hugging face", "transformers", "anthropic", "gemini",
+        "jira", "confluence", "slack", "notion",
+        "linux", "unix", "windows", "macos",
+        "excel", "powerpoint", "word",
+        "figma", "sketch", "adobe",
+        "selenium", "cypress", "jest", "pytest",
+        "nginx", "apache", "tomcat",
+        "rabbitmq", "celery", "cron",
+        "s3", "ec2", "lambda", "ecs", "eks",
+        "vpc", "iam", "cloudformation", "cdk",
     ],
     "TECHNICAL": [
+        "machine learning", "deep learning", "neural network", "neural networks",
+        "nlp", "natural language processing", "computer vision",
+        "reinforcement learning", "transfer learning", "federated learning",
         "data science", "data engineering", "data analytics", "data analysis",
+        "data modeling", "data governance", "data quality",
+        "statistical analysis", "statistics", "bayesian", "hypothesis testing",
+        "a/b testing", "experimentation", "causal inference",
+        "etl", "elt", "data pipeline", "data pipelines",
+        "data warehouse", "data lake", "data mesh", "data catalog",
+        "api", "rest", "restful", "graphql", "grpc", "websocket",
+        "microservices", "distributed systems", "event driven",
+        "cloud computing", "cloud architecture", "serverless",
+        "devops", "mlops", "dataops", "gitops", "ci/cd",
+        "agile", "scrum", "kanban", "lean",
+        "recommendation system", "recommendation engine", "recommender",
+        "search engine", "information retrieval", "ranking",
+        "time series", "forecasting", "anomaly detection", "fraud detection",
+        "classification", "regression", "clustering", "segmentation",
+        "dimensionality reduction", "feature engineering", "feature selection",
+        "model deployment", "model serving", "model monitoring",
+        "generative ai", "gen ai", "rag", "retrieval augmented",
+        "fine-tuning", "fine tuning", "prompt engineering", "few-shot",
+        "object detection", "image segmentation", "image classification",
+        "speech recognition", "text to speech", "sentiment analysis",
+        "named entity recognition", "text classification", "summarization",
+        "question answering", "chatbot", "conversational ai",
+        "optimization", "linear programming", "operations research",
+        "simulation", "monte carlo", "ab testing",
+        "web scraping", "data collection", "annotation",
+        "unit testing", "integration testing", "test driven",
+        "system design", "software architecture", "design patterns",
+        "database design", "schema design", "normalization",
+        "version control", "code review", "pair programming",
+        "containerization", "orchestration", "infrastructure as code",
+        "monitoring", "logging", "alerting", "observability",
+        "security", "encryption", "authentication", "authorization",
+        "performance optimization", "scalability", "reliability",
+        "data visualization", "dashboard", "reporting",
+        "technical writing", "documentation",
+        "research", "publications", "peer reviewed",
+    ],
+    "BUS": [
+        "business intelligence", "bi", "kpi", "roi", "revenue",
+        "product management", "product development", "product strategy",
+        "business analysis", "requirements gathering", "process improvement",
+        "customer success", "customer experience", "user experience",
+        "marketing analytics", "growth", "acquisition", "retention",
+        "financial analysis", "budgeting", "forecasting",
+        "supply chain", "logistics", "inventory",
+        "risk management", "compliance", "regulatory",
+        "consulting", "advisory", "strategy",
     ],
     "SOFT": [
         "leadership", "communication", "teamwork", "collaboration",
+        "problem solving", "problem-solving", "critical thinking",
+        "analytical", "analytical thinking",
+        "mentoring", "mentorship", "coaching",
+        "presentation", "public speaking",
+        "stakeholder management", "stakeholder engagement",
+        "project management", "program management", "time management",
+        "cross-functional", "cross functional",
+        "strategic thinking", "strategic planning",
+        "decision making", "decision-making",
+        "conflict resolution", "negotiation",
+        "adaptability", "flexibility", "resilience",
+        "creativity", "innovation", "initiative",
+        "attention to detail", "detail oriented", "detail-oriented",
+        "self-motivated", "self motivated", "proactive",
+        "interpersonal", "relationship building",
+        "emotional intelligence", "empathy",
     ],
 }
 def extract_skills_regex(text: str) -> Dict[str, List[str]]:
     """Detect known skills in ``text`` using the TECH_SKILLS_VOCAB patterns.

     Each vocabulary entry is a lowercase regex fragment that is searched for
     in the lowercased text with loose word-boundary anchors on both sides.

     Args:
         text: Free text to scan (e.g. a job description).

     Returns:
         Dict with the keys "BUS", "TECHNOLOGY", "TECHNICAL" and "SOFT", each
         holding the display names of the matched skills in vocabulary order.
         A skill is reported once, under the first category that lists it.
     """
     text_lower = text.lower()
     skills = {"BUS": [], "TECHNOLOGY": [], "TECHNICAL": [], "SOFT": []}
     seen = set()
     for category, patterns in TECH_SKILLS_VOCAB.items():
         bucket = skills.get(category)
         if bucket is None:
             # Vocabulary category with no output slot: skip it instead of failing.
             continue
         for pattern in patterns:
             try:
                 # The non-capturing group keeps the anchors bound to the whole
                 # fragment, whatever operators the fragment itself contains.
                 found = re.search(
                     r'(?:^|\b|[\s,;(])(?:' + pattern + r')(?:$|\b|[\s,;)])',
                     text_lower,
                 )
             except re.error as exc:
                 # A malformed vocabulary entry must not abort the whole scan.
                 logger.warning("Skipping invalid skill pattern %r: %s", pattern, exc)
                 continue
             if not found:
                 continue
             # Turn the regex fragment back into readable text: unescape "+" and
             # ".", then drop the "?" quantifier. After "\." has become ".", the
             # quantifier is a bare "?", so it has to be removed without the
             # backslash (otherwise "next\.?js" is reported as "next.?js").
             clean = pattern.replace("\\+", "+").replace("\\.", ".").replace("?", "")
             if clean in seen:
                 continue
             seen.add(clean)
             # Short tokens read as acronyms (AWS, SQL, C++); longer ones are title-cased.
             bucket.append(clean.title() if len(clean) > 3 else clean.upper())
     return skills
+# ═══════════════════════════════════════════════════════════════════════
+# MAIN EXTRACTION FUNCTION
+# ═══════════════════════════════════════════════════════════════════════
 def extract_all_keywords(
     job_description: str,
     resume_text: Optional[str] = None,
 ) -> Dict:
+    """
+    Fast keyword extraction using regex + TF-IDF.
+    No ML model inference — works instantly on any hardware.
+    """
+    logger.info("Starting keyword extraction (regex + TF-IDF)...")
+    # Layer 1: Regex skill detection (instant)
+    ner_skills = extract_skills_regex(job_description)
+    skill_count = sum(len(v) for v in ner_skills.values())
+    logger.info(f"  Regex skills: {skill_count} found")
+    # Layer 2: TF-IDF terms (instant)
     tfidf_terms = extract_keywords_tfidf(job_description, top_n=20)
+    logger.info(f"  TF-IDF terms: {len(tfidf_terms)} found")
     # Combine
     all_kw_set = set()
     for category, skills in ner_skills.items():
         for skill in skills:
             all_kw_set.add(skill.lower())
     for term in tfidf_terms:
         all_kw_set.add(term.lower())
     all_keywords_flat = sorted(all_kw_set)
+    # Required keywords = all regex-matched skills + top TF-IDF
     required = set()
     for category, skills in ner_skills.items():
         for skill in skills:
             required.add(skill.lower())
+    for term in tfidf_terms[:10]:
+        required.add(term.lower())
     required_keywords = sorted(required)
+    # Find what's missing from resume
     resume_keywords = []
     missing_keywords = []
     if resume_text:
             else:
                 missing_keywords.append(kw)
+    logger.info(f"  Total: {len(all_keywords_flat)} keywords, {len(required_keywords)} required, {len(missing_keywords)} missing")
     return {
         "ner_skills": ner_skills,
+        "keybert_keywords": [],  # not used — kept for compatibility
         "tfidf_terms": tfidf_terms,
         "all_keywords_flat": all_keywords_flat,
         "required_keywords": required_keywords,
 def format_keywords_report(kw_data: Dict) -> str:
     lines = []
     lines.append("═══ KEYWORD EXTRACTION REPORT ═══")
+    lines.append("Methods: Regex Skills ✅ | TF-IDF ✅\n")
     ner = kw_data["ner_skills"]
     if any(ner.values()):
+        lines.append("🔍 Skills Detected:")
         if ner.get("TECHNOLOGY"):
             lines.append(f"  💻 Technology: {', '.join(ner['TECHNOLOGY'])}")
         if ner.get("TECHNICAL"):
             lines.append(f"  🤝 Soft Skills: {', '.join(ner['SOFT'])}")
         lines.append("")
     tfidf = kw_data.get("tfidf_terms", [])
     if tfidf:
+        lines.append(f"📊 TF-IDF Key Terms: {', '.join(tfidf[:15])}")
         lines.append("")
     required = kw_data["required_keywords"]