"""
Component 2: JD Keyword Extractor

Uses regex skill matching + TF-IDF for instant, reliable keyword extraction.
No ML model inference during requests — works instantly on free CPU Spaces.
"""

import re
import logging
from functools import lru_cache
from typing import List, Dict, Tuple, Optional

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Number of TF-IDF terms extracted from the job description, and how many of
# the top-ranked ones are promoted to "required" keywords.
_TFIDF_TOP_N = 20
_REQUIRED_TFIDF_N = 10


# ═══════════════════════════════════════════════════════════════════════
# TF-IDF EXTRACTION (always fast, no model needed)
# ═══════════════════════════════════════════════════════════════════════

def extract_keywords_tfidf(text: str, top_n: int = 20) -> List[str]:
    """Return up to *top_n* highest-scoring uni/bi-gram terms of *text*.

    The vectorizer is fitted on *text* alone (a single document), so the IDF
    factor is identical for every term and the ranking is effectively by
    normalised term frequency after English stop-word removal.

    Args:
        text: Raw text to analyse (e.g. a job description).
        top_n: Maximum number of terms to return.

    Returns:
        Terms ordered from highest to lowest score, as plain ``str``. An
        empty list is returned for blank text, a non-positive ``top_n``, or
        when vectorisation fails (the failure is logged as a warning).
    """
    # Blank input would make the vectorizer fail with an empty vocabulary,
    # and a negative top_n would slice from the wrong end; answer directly.
    if not text or not text.strip() or top_n <= 0:
        return []

    # Imported lazily so that importing this module stays cheap.
    from sklearn.feature_extraction.text import TfidfVectorizer
    import numpy as np

    try:
        vectorizer = TfidfVectorizer(
            stop_words="english",
            ngram_range=(1, 2),
            max_features=500,
        )
        tfidf_matrix = vectorizer.fit_transform([text])
        feature_names = vectorizer.get_feature_names_out()
        scores = tfidf_matrix.toarray()[0]
        top_indices = np.argsort(scores)[::-1][:top_n]
        # str(...) converts numpy string scalars into built-in strings.
        return [str(feature_names[i]) for i in top_indices if scores[i] > 0]
    except Exception as e:
        # Deliberate best-effort: keyword extraction must not break the
        # caller, so the problem is reported and an empty result returned.
        logger.warning("TF-IDF error: %s", e)
        return []


# ═══════════════════════════════════════════════════════════════════════
# REGEX SKILL EXTRACTION (comprehensive vocabulary, instant)
# ═══════════════════════════════════════════════════════════════════════

# Entries are regex fragments matched against lower-cased text, so regex
# metacharacters that should match literally must be escaped (e.g. "c\\+\\+").
TECH_SKILLS_VOCAB = {
    "TECHNOLOGY": [
        "python", "java", "javascript", "typescript", "c\\+\\+", "c#", "ruby",
        "go", "rust", "scala", "kotlin", "swift", "php", "matlab", "julia",
        "sql", "nosql", "html", "css", "bash", "shell", "perl",
        "tensorflow", "pytorch", "keras", "scikit-learn", "sklearn", "pandas",
        "numpy", "scipy", "matplotlib", "seaborn", "plotly",
        "spark", "pyspark", "hadoop", "kafka", "airflow", "dbt", "flink",
        "docker", "kubernetes", "k8s", "terraform", "ansible", "helm",
        "aws", "azure", "gcp", "google cloud", "heroku",
        "react", "angular", "vue", "next\\.?js", "node\\.?js", "express",
        "django", "flask", "fastapi", "spring", "rails",
        "postgresql", "mysql", "mongodb", "redis", "elasticsearch",
        "cassandra", "dynamodb", "snowflake", "bigquery", "redshift",
        "databricks",
        "git", "jenkins", "ci/cd", "github", "gitlab", "bitbucket",
        "tableau", "power bi", "looker", "grafana", "superset",
        "mlflow", "kubeflow", "wandb", "dvc", "sagemaker",
        "bert", "gpt", "llm", "llms", "langchain", "openai", "hugging face",
        "transformers", "anthropic", "gemini",
        "jira", "confluence", "slack", "notion",
        "linux", "unix", "windows", "macos",
        "excel", "powerpoint", "word",
        "figma", "sketch", "adobe",
        "selenium", "cypress", "jest", "pytest",
        "nginx", "apache", "tomcat",
        "rabbitmq", "celery", "cron",
        "s3", "ec2", "lambda", "ecs", "eks", "vpc", "iam", "cloudformation",
        "cdk",
    ],
    "TECHNICAL": [
        "machine learning", "deep learning", "neural network",
        "neural networks", "nlp", "natural language processing",
        "computer vision", "reinforcement learning", "transfer learning",
        "federated learning",
        "data science", "data engineering", "data analytics", "data analysis",
        "data modeling", "data governance", "data quality",
        "statistical analysis", "statistics", "bayesian",
        "hypothesis testing", "a/b testing", "experimentation",
        "causal inference",
        "etl", "elt", "data pipeline", "data pipelines", "data warehouse",
        "data lake", "data mesh", "data catalog",
        "api", "rest", "restful", "graphql", "grpc", "websocket",
        "microservices", "distributed systems", "event driven",
        "cloud computing", "cloud architecture", "serverless",
        "devops", "mlops", "dataops", "gitops", "ci/cd",
        "agile", "scrum", "kanban", "lean",
        "recommendation system", "recommendation engine", "recommender",
        "search engine", "information retrieval", "ranking",
        "time series", "forecasting", "anomaly detection", "fraud detection",
        "classification", "regression", "clustering", "segmentation",
        "dimensionality reduction", "feature engineering",
        "feature selection",
        "model deployment", "model serving", "model monitoring",
        "generative ai", "gen ai", "rag", "retrieval augmented",
        "fine-tuning", "fine tuning", "prompt engineering", "few-shot",
        "object detection", "image segmentation", "image classification",
        "speech recognition", "text to speech",
        "sentiment analysis", "named entity recognition",
        "text classification", "summarization", "question answering",
        "chatbot", "conversational ai",
        "optimization", "linear programming", "operations research",
        "simulation", "monte carlo", "ab testing",
        "web scraping", "data collection", "annotation",
        "unit testing", "integration testing", "test driven",
        "system design", "software architecture", "design patterns",
        "database design", "schema design", "normalization",
        "version control", "code review", "pair programming",
        "containerization", "orchestration", "infrastructure as code",
        "monitoring", "logging", "alerting", "observability",
        "security", "encryption", "authentication", "authorization",
        "performance optimization", "scalability", "reliability",
        "data visualization", "dashboard", "reporting",
        "technical writing", "documentation",
        "research", "publications", "peer reviewed",
    ],
    "BUS": [
        "business intelligence", "bi", "kpi", "roi", "revenue",
        "product management", "product development", "product strategy",
        "business analysis", "requirements gathering", "process improvement",
        "customer success", "customer experience", "user experience",
        "marketing analytics", "growth", "acquisition", "retention",
        "financial analysis", "budgeting", "forecasting",
        "supply chain", "logistics", "inventory",
        "risk management", "compliance", "regulatory",
        "consulting", "advisory", "strategy",
    ],
    "SOFT": [
        "leadership", "communication", "teamwork", "collaboration",
        "problem solving", "problem-solving", "critical thinking",
        "analytical", "analytical thinking",
        "mentoring", "mentorship", "coaching",
        "presentation", "public speaking",
        "stakeholder management", "stakeholder engagement",
        "project management", "program management", "time management",
        "cross-functional", "cross functional",
        "strategic thinking", "strategic planning",
        "decision making", "decision-making",
        "conflict resolution", "negotiation",
        "adaptability", "flexibility", "resilience",
        "creativity", "innovation", "initiative",
        "attention to detail", "detail oriented", "detail-oriented",
        "self-motivated", "self motivated", "proactive",
        "interpersonal", "relationship building", "emotional intelligence",
        "empathy",
    ],
}


@lru_cache(maxsize=1024)
def _skill_regex(pattern: str) -> "re.Pattern":
    """Compile a vocabulary fragment into a whole-token matcher (cached)."""
    # Lookarounds instead of \b so that fragments ending in a non-word
    # character ("c++", "c#") still match before punctuation such as "." or
    # "/", where \b never fires.
    return re.compile(r"(?<!\w)(?:" + pattern + r")(?!\w)")


def _pattern_to_label(pattern: str) -> str:
    """Turn a vocabulary regex fragment into plain text for display."""
    # One pass: "\x" becomes "x" (unescape), a bare "?" quantifier is dropped,
    # so "next\.?js" -> "next.js" and "c\+\+" -> "c++".
    return re.sub(r"\\(.)|\?", lambda m: m.group(1) or "", pattern)


def extract_skills_regex(text: str) -> Dict[str, List[str]]:
    """Detect vocabulary skills in *text* using regex matching.

    Every fragment in ``TECH_SKILLS_VOCAB`` is searched, as a whole token, in
    the lower-cased text. A skill is reported once, under the first category
    (in vocabulary order) in which it matches.

    Args:
        text: Text to scan, e.g. a job description. ``None`` is treated as
            empty text.

    Returns:
        A dict with at least the keys ``"BUS"``, ``"TECHNOLOGY"``,
        ``"TECHNICAL"`` and ``"SOFT"``, each mapping to display names in
        vocabulary order. Names longer than three characters are title-cased,
        shorter ones upper-cased (``"sql"`` -> ``"SQL"``).
    """
    text_lower = (text or "").lower()
    skills: Dict[str, List[str]] = {
        "BUS": [], "TECHNOLOGY": [], "TECHNICAL": [], "SOFT": [],
    }
    seen = set()

    for category, patterns in TECH_SKILLS_VOCAB.items():
        # setdefault keeps a category added to the vocabulary later from
        # raising KeyError here.
        bucket = skills.setdefault(category, [])
        for pattern in patterns:
            try:
                matcher = _skill_regex(pattern)
            except re.error as exc:
                # A malformed vocabulary entry should not break extraction,
                # but it should not vanish silently either.
                logger.warning("Skipping invalid skill pattern %r: %s", pattern, exc)
                continue

            if not matcher.search(text_lower):
                continue

            label = _pattern_to_label(pattern)
            if label in seen:
                continue
            seen.add(label)
            bucket.append(label.title() if len(label) > 3 else label.upper())

    return skills


# ═══════════════════════════════════════════════════════════════════════
# MAIN EXTRACTION FUNCTION
# ═══════════════════════════════════════════════════════════════════════

def _term_in_text(term: str, text_lower: str) -> bool:
    """Return True if *term* occurs in *text_lower* as a whole word/phrase."""
    # Plain substring tests report "bi" inside "ability" or "go" inside
    # "google". Require token boundaries, tolerating a simple plural "s".
    pattern = r"(?<!\w)" + re.escape(term) + r"s?(?!\w)"
    return re.search(pattern, text_lower) is not None


def extract_all_keywords(
    job_description: str,
    resume_text: Optional[str] = None,
) -> Dict:
    """
    Fast keyword extraction using regex + TF-IDF.
    No ML model inference — works instantly on any hardware.

    Args:
        job_description: Job description text to extract keywords from.
        resume_text: Optional resume text. When given (and non-empty), each
            required keyword is classified as present in or missing from it,
            using case-insensitive whole-word matching.

    Returns:
        A dict with the keys:
            ``ner_skills``: category -> display names from the regex pass.
            ``keybert_keywords``: always ``[]`` (kept for compatibility).
            ``tfidf_terms``: TF-IDF terms of the job description.
            ``all_keywords_flat``: sorted, lower-cased union of all skills
                and TF-IDF terms.
            ``required_keywords``: sorted, lower-cased union of all skills
                and the top TF-IDF terms.
            ``resume_keywords``: required keywords found in the resume.
            ``missing_keywords``: required keywords not found in the resume.
        The last two are empty when no resume text is supplied.
    """
    logger.info("Starting keyword extraction (regex + TF-IDF)...")

    # Layer 1: Regex skill detection (instant)
    ner_skills = extract_skills_regex(job_description)
    skill_count = sum(len(found) for found in ner_skills.values())
    logger.info(" Regex skills: %d found", skill_count)

    # Layer 2: TF-IDF terms (instant)
    tfidf_terms = extract_keywords_tfidf(job_description, top_n=_TFIDF_TOP_N)
    logger.info(" TF-IDF terms: %d found", len(tfidf_terms))

    # Combine: every keyword is compared and reported in lower case.
    skill_terms = {
        skill.lower() for found in ner_skills.values() for skill in found
    }
    tfidf_lower = [term.lower() for term in tfidf_terms]
    all_keywords_flat = sorted(skill_terms.union(tfidf_lower))

    # Required keywords = all regex-matched skills + top TF-IDF
    required_keywords = sorted(
        skill_terms.union(tfidf_lower[:_REQUIRED_TFIDF_N])
    )

    # Find what's missing from resume
    resume_keywords: List[str] = []
    missing_keywords: List[str] = []
    if resume_text:
        resume_text_lower = resume_text.lower()
        for kw in required_keywords:
            if _term_in_text(kw, resume_text_lower):
                resume_keywords.append(kw)
            else:
                missing_keywords.append(kw)

    logger.info(
        " Total: %d keywords, %d required, %d missing",
        len(all_keywords_flat), len(required_keywords), len(missing_keywords),
    )

    return {
        "ner_skills": ner_skills,
        "keybert_keywords": [],  # not used — kept for compatibility
        "tfidf_terms": tfidf_terms,
        "all_keywords_flat": all_keywords_flat,
        "required_keywords": required_keywords,
        "resume_keywords": resume_keywords,
        "missing_keywords": missing_keywords,
    }


# (category key, heading) pairs in the order they appear in the report.
_SKILL_SECTIONS = (
    ("TECHNOLOGY", "💻 Technology"),
    ("TECHNICAL", "🔧 Technical"),
    ("BUS", "📊 Business"),
    ("SOFT", "🤝 Soft Skills"),
)


def format_keywords_report(kw_data: Dict) -> str:
    """Render the result of ``extract_all_keywords`` as a text report.

    Args:
        kw_data: Dict shaped like the return value of
            ``extract_all_keywords``. Missing or empty entries are skipped
            rather than raising.

    Returns:
        A newline-joined report containing, in order and only when non-empty:
        detected skills per category, the first 15 TF-IDF terms, required
        keywords, keywords missing from the resume, and keywords already in
        the resume.
    """
    lines = [
        "═══ KEYWORD EXTRACTION REPORT ═══",
        "Methods: Regex Skills ✅ | TF-IDF ✅\n",
    ]

    ner = kw_data.get("ner_skills") or {}
    if any(ner.values()):
        lines.append("🔍 Skills Detected:")
        for key, heading in _SKILL_SECTIONS:
            found = ner.get(key)
            if found:
                lines.append(f" {heading}: {', '.join(found)}")
        lines.append("")

    tfidf = kw_data.get("tfidf_terms") or []
    if tfidf:
        lines.append(f"📊 TF-IDF Key Terms: {', '.join(tfidf[:15])}")
        lines.append("")

    required = kw_data.get("required_keywords") or []
    if required:
        lines.append(f"⚡ Required Keywords ({len(required)}): {', '.join(required)}")
        lines.append("")

    missing = kw_data.get("missing_keywords") or []
    if missing:
        lines.append(f"❌ Missing from Resume ({len(missing)}): {', '.join(missing)}")
        lines.append("")

    present = kw_data.get("resume_keywords") or []
    if present:
        lines.append(f"✅ Already in Resume ({len(present)}): {', '.join(present)}")

    return "\n".join(lines)