""" Component 4: ATS Checker Hybrid scoring system: - Keyword Match Score (35%): Exact + fuzzy keyword matching - Skill Coverage Score (25%): NER-extracted skill overlap - TF-IDF Cosine Similarity (20%): Semantic document similarity - Section Completeness (10%): Standard ATS sections present - Formatting Quality (10%): ATS-friendly formatting checks Returns detailed score breakdown + actionable feedback. """ import re from typing import Dict, List, Tuple from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity import numpy as np # Standard ATS-expected sections REQUIRED_SECTIONS = ["summary", "experience", "skills", "education"] BONUS_SECTIONS = ["certifications", "projects", "awards"] # ATS formatting rules FORMAT_CHECKS = { "has_contact_info": r'[\w.+-]+@[\w-]+\.[\w.]+', # email "has_phone": r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', "uses_bullet_points": r'[•\-\*]\s', "has_dates": r'\b(19|20)\d{2}\b', "no_tables_detected": None, # Inverse check "no_images_detected": None, # Inverse check (for text output always passes) } def compute_keyword_match_score( cv_text: str, required_keywords: List[str], all_keywords: List[str], ) -> Tuple[float, Dict]: """ Score keyword presence in the CV. Required keywords weighted 2x. Returns (score 0-100, details dict). """ cv_lower = cv_text.lower() # Required keyword matches req_matches = [] req_misses = [] for kw in required_keywords: if kw.lower() in cv_lower: req_matches.append(kw) else: # Fuzzy: check if any word from the keyword appears words = kw.lower().split() if len(words) > 1 and all(w in cv_lower for w in words): req_matches.append(kw + " (partial)") else: req_misses.append(kw) # All keyword matches all_matches = [] for kw in all_keywords: if kw.lower() in cv_lower: all_matches.append(kw) # Score: required matches worth 60%, general matches worth 40% req_score = (len(req_matches) / max(len(required_keywords), 1)) * 60 all_score = (len(all_matches) / max(len(all_keywords), 1)) * 40 total = min(req_score + all_score, 100) return total, { "required_matched": req_matches, "required_missing": req_misses, "total_keywords_matched": len(all_matches), "total_keywords": len(all_keywords), } def compute_skill_coverage_score( cv_text: str, jd_skills: Dict[str, List[str]], ) -> Tuple[float, Dict]: """ Score NER-extracted skill coverage. Returns (score 0-100, details dict). """ cv_lower = cv_text.lower() total_skills = 0 matched_skills = 0 matched_list = [] missing_list = [] for category, skills in jd_skills.items(): for skill in skills: total_skills += 1 if skill.lower() in cv_lower: matched_skills += 1 matched_list.append(f"{skill} ({category})") else: missing_list.append(f"{skill} ({category})") score = (matched_skills / max(total_skills, 1)) * 100 return score, { "matched": matched_list, "missing": missing_list, "coverage": f"{matched_skills}/{total_skills}", } def compute_tfidf_similarity(cv_text: str, jd_text: str) -> Tuple[float, Dict]: """ TF-IDF cosine similarity between CV and JD. Returns (score 0-100, details dict). """ try: vectorizer = TfidfVectorizer( stop_words="english", ngram_range=(1, 2), max_features=1000, ) tfidf_matrix = vectorizer.fit_transform([cv_text, jd_text]) sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0] score = float(sim) * 100 # Find top shared terms feature_names = vectorizer.get_feature_names_out() cv_vec = tfidf_matrix[0].toarray()[0] jd_vec = tfidf_matrix[1].toarray()[0] # Terms important in both shared_importance = cv_vec * jd_vec top_shared_idx = np.argsort(shared_importance)[::-1][:10] top_shared = [feature_names[i] for i in top_shared_idx if shared_importance[i] > 0] return score, {"cosine_similarity": round(sim, 4), "top_shared_terms": top_shared} except Exception: return 50.0, {"cosine_similarity": 0.5, "top_shared_terms": []} def compute_section_score(cv_text: str) -> Tuple[float, Dict]: """ Check if standard ATS sections are present. Returns (score 0-100, details dict). """ cv_lower = cv_text.lower() found = [] missing = [] section_patterns = { "summary": r'\b(professional\s+summary|summary|profile|objective)\b', "experience": r'\b(experience|work\s+experience|employment)\b', "skills": r'\b(skills|technical\s+skills|competencies)\b', "education": r'\b(education|academic)\b', } for section, pattern in section_patterns.items(): if re.search(pattern, cv_lower): found.append(section) else: missing.append(section) # Check bonus sections bonus_found = [] for section in BONUS_SECTIONS: if section in cv_lower: bonus_found.append(section) # Required sections: 80% weight, bonus: 20% req_score = (len(found) / len(REQUIRED_SECTIONS)) * 80 bonus_score = (len(bonus_found) / max(len(BONUS_SECTIONS), 1)) * 20 total = min(req_score + bonus_score, 100) return total, { "required_found": found, "required_missing": missing, "bonus_found": bonus_found, } def compute_format_score(cv_text: str) -> Tuple[float, Dict]: """ Check ATS formatting quality. Returns (score 0-100, details dict). """ checks_passed = [] checks_failed = [] # Contact info if re.search(FORMAT_CHECKS["has_contact_info"], cv_text): checks_passed.append("✅ Email found") else: checks_failed.append("❌ No email detected") # Phone if re.search(FORMAT_CHECKS["has_phone"], cv_text): checks_passed.append("✅ Phone number found") else: checks_failed.append("⚠️ No phone number detected") # Bullet points if re.search(FORMAT_CHECKS["uses_bullet_points"], cv_text): checks_passed.append("✅ Uses bullet points") else: checks_failed.append("⚠️ No bullet points detected") # Dates if re.search(FORMAT_CHECKS["has_dates"], cv_text): checks_passed.append("✅ Dates present") else: checks_failed.append("⚠️ No dates detected") # Length check (too short or too long) word_count = len(cv_text.split()) if 200 <= word_count <= 1200: checks_passed.append(f"✅ Good length ({word_count} words)") elif word_count < 200: checks_failed.append(f"⚠️ Too short ({word_count} words, aim for 400+)") else: checks_failed.append(f"⚠️ Very long ({word_count} words, consider condensing)") # Single column check (no tab-heavy content) tab_count = cv_text.count('\t') if tab_count < 5: checks_passed.append("✅ Single-column layout") else: checks_failed.append("⚠️ Multiple columns detected (may break ATS)") total_checks = len(checks_passed) + len(checks_failed) score = (len(checks_passed) / max(total_checks, 1)) * 100 return score, {"passed": checks_passed, "failed": checks_failed} def run_ats_check( cv_text: str, job_description: str, keywords_data: Dict, ) -> Dict: """ Run full ATS check with weighted scoring. Weights: Keyword Match: 35% Skill Coverage: 25% TF-IDF Similarity: 20% Section Completeness: 10% Formatting: 10% Returns full report dict. """ required_kws = keywords_data.get("required_keywords", []) all_kws = keywords_data.get("all_keywords_flat", []) ner_skills = keywords_data.get("ner_skills", {}) # Run all sub-scores kw_score, kw_details = compute_keyword_match_score(cv_text, required_kws, all_kws) skill_score, skill_details = compute_skill_coverage_score(cv_text, ner_skills) tfidf_score, tfidf_details = compute_tfidf_similarity(cv_text, job_description) section_score, section_details = compute_section_score(cv_text) format_score, format_details = compute_format_score(cv_text) # Weighted total total_score = ( kw_score * 0.35 + skill_score * 0.25 + tfidf_score * 0.20 + section_score * 0.10 + format_score * 0.10 ) # Generate feedback feedback = _generate_feedback( total_score, kw_score, skill_score, tfidf_score, section_score, format_score, kw_details, skill_details, section_details, format_details, ) return { "total_score": round(total_score, 1), "breakdown": { "keyword_match": {"score": round(kw_score, 1), "weight": "35%", "details": kw_details}, "skill_coverage": {"score": round(skill_score, 1), "weight": "25%", "details": skill_details}, "tfidf_similarity": {"score": round(tfidf_score, 1), "weight": "20%", "details": tfidf_details}, "section_completeness": {"score": round(section_score, 1), "weight": "10%", "details": section_details}, "formatting": {"score": round(format_score, 1), "weight": "10%", "details": format_details}, }, "feedback": feedback, "pass": total_score >= 85, } def _generate_feedback( total, kw_score, skill_score, tfidf_score, section_score, format_score, kw_details, skill_details, section_details, format_details, ) -> List[str]: """Generate actionable feedback based on scores.""" feedback = [] if total >= 85: feedback.append("🎉 Excellent ATS compatibility! This CV should pass most ATS systems.") elif total >= 70: feedback.append("👍 Good ATS compatibility. A few improvements recommended.") else: feedback.append("⚠️ Needs improvement for ATS compatibility.") # Keyword feedback missing = kw_details.get("required_missing", []) if missing: feedback.append(f"📝 Add these missing keywords: {', '.join(missing[:8])}") # Skill feedback missing_skills = skill_details.get("missing", []) if missing_skills: feedback.append(f"🔧 Missing skills to add: {', '.join(missing_skills[:5])}") # Section feedback missing_sections = section_details.get("required_missing", []) if missing_sections: feedback.append(f"📋 Add these sections: {', '.join(s.title() for s in missing_sections)}") # Format feedback failed_formats = format_details.get("failed", []) for f in failed_formats: feedback.append(f) # TF-IDF feedback if tfidf_score < 30: feedback.append("📊 Low semantic similarity — mirror more language from the job description.") return feedback def format_ats_report(report: Dict) -> str: """Format ATS report into a human-readable string.""" lines = [] score = report["total_score"] if score >= 85: emoji = "🟢" elif score >= 70: emoji = "🟡" else: emoji = "🔴" lines.append(f"{'═' * 50}") lines.append(f" {emoji} ATS COMPATIBILITY SCORE: {score}/100") lines.append(f" {'PASS ✅' if report['pass'] else 'NEEDS IMPROVEMENT ⚠️'}") lines.append(f"{'═' * 50}\n") # Breakdown lines.append("📊 SCORE BREAKDOWN:") for name, data in report["breakdown"].items(): bar_len = int(data["score"] / 5) bar = "█" * bar_len + "░" * (20 - bar_len) display_name = name.replace("_", " ").title() lines.append(f" {bar} {data['score']:5.1f} {display_name} ({data['weight']})") lines.append("") # Detailed feedback lines.append("💡 FEEDBACK:") for fb in report["feedback"]: lines.append(f" {fb}") lines.append("") # Keyword details kw = report["breakdown"]["keyword_match"]["details"] if kw.get("required_missing"): lines.append(f" 🔑 Missing Keywords: {', '.join(kw['required_missing'][:10])}") if kw.get("required_matched"): matched = [m for m in kw["required_matched"] if "(partial)" not in m] if matched: lines.append(f" ✅ Matched Keywords: {', '.join(matched[:10])}") return "\n".join(lines)