"""
Component 4: ATS Checker
Hybrid scoring system:
  - Keyword Match Score (35%): Exact + fuzzy keyword matching
  - Skill Coverage Score (25%): NER-extracted skill overlap
  - TF-IDF Cosine Similarity (20%): Semantic document similarity
  - Section Completeness (10%): Standard ATS sections present
  - Formatting Quality (10%): ATS-friendly formatting checks

Returns detailed score breakdown + actionable feedback.
"""

import re
from typing import Dict, List, Tuple
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


# Section names an ATS is expected to find in a CV.
# compute_section_score() divides by len(REQUIRED_SECTIONS) for the required
# share of its score; the regexes used to detect each section live in that
# function, not here.
REQUIRED_SECTIONS = ["summary", "experience", "skills", "education"]
# Optional sections; compute_section_score() detects each one with a plain
# substring test against the lower-cased CV text.
BONUS_SECTIONS = ["certifications", "projects", "awards"]

# Regex patterns for ATS formatting checks, keyed by check name.
# compute_format_score() applies the four non-None patterns with re.search().
FORMAT_CHECKS = {
    "has_contact_info": r'[\w.+-]+@[\w-]+\.[\w.]+',  # email-like token: local@domain.tld
    "has_phone": r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',  # 3-3-4 digit groups, optional parentheses/separators
    "uses_bullet_points": r'[•\-\*]\s',  # bullet, hyphen or asterisk followed by whitespace
    "has_dates": r'\b(19|20)\d{2}\b',  # standalone four-digit year 1900-2099
    # The two entries below carry no pattern and are not read by
    # compute_format_score() in this file.
    "no_tables_detected": None,  # Inverse check
    "no_images_detected": None,  # Inverse check (for text output always passes)
}


def compute_keyword_match_score(
    cv_text: str,
    required_keywords: List[str],
    all_keywords: List[str],
) -> Tuple[float, Dict]:
    """
    Score keyword presence in the CV.

    Matching is a case-insensitive substring test against the CV text.
    A multi-word required keyword that does not appear verbatim still counts
    as matched (tagged " (partial)") when every one of its words appears
    somewhere in the CV.

    Weighting: required keywords contribute 60 points and the general
    keyword list 40 points. If one of the two lists is empty, the other list
    carries the full 100 points so that a missing list does not cap the
    achievable score. If both lists are empty the score is 0.

    Args:
        cv_text: Plain text of the CV.
        required_keywords: Keywords the job description marks as required.
        all_keywords: Flat list of every keyword extracted from the job
            description.

    Returns:
        (score 0-100, details dict) where details holds
        "required_matched", "required_missing", "total_keywords_matched"
        and "total_keywords".
    """
    cv_lower = cv_text.lower()

    # Required keyword matches
    req_matches = []
    req_misses = []
    for kw in required_keywords:
        kw_lower = kw.lower()
        if kw_lower in cv_lower:
            req_matches.append(kw)
            continue
        # Fuzzy fallback: a multi-word keyword counts as a partial match
        # only when ALL of its words occur somewhere in the CV.
        words = kw_lower.split()
        if len(words) > 1 and all(w in cv_lower for w in words):
            req_matches.append(kw + " (partial)")
        else:
            req_misses.append(kw)

    # General keyword matches (exact substring only)
    all_matches = [kw for kw in all_keywords if kw.lower() in cv_lower]

    # Distribute the 100 points over the lists that actually have entries;
    # otherwise an empty list would silently cap the score at 40 or 60.
    if required_keywords and all_keywords:
        req_weight, all_weight = 60, 40
    elif required_keywords:
        req_weight, all_weight = 100, 0
    elif all_keywords:
        req_weight, all_weight = 0, 100
    else:
        req_weight, all_weight = 0, 0

    req_score = (len(req_matches) / max(len(required_keywords), 1)) * req_weight
    all_score = (len(all_matches) / max(len(all_keywords), 1)) * all_weight
    total = min(req_score + all_score, 100)

    return total, {
        "required_matched": req_matches,
        "required_missing": req_misses,
        "total_keywords_matched": len(all_matches),
        "total_keywords": len(all_keywords),
    }


def compute_skill_coverage_score(
    cv_text: str,
    jd_skills: Dict[str, List[str]],
) -> Tuple[float, Dict]:
    """
    Measure how many of the job description's skills show up in the CV.

    Each skill is looked up with a case-insensitive substring test and is
    reported as "<skill> (<category>)" in either the matched or missing list.

    Args:
        cv_text: Plain text of the CV.
        jd_skills: Mapping of category name to the skills in that category.

    Returns:
        (score 0-100, details dict) with "matched", "missing" and a
        "coverage" string of the form "<matched>/<total>".
    """
    haystack = cv_text.lower()
    present: List[str] = []
    absent: List[str] = []

    for group, names in jd_skills.items():
        for name in names:
            label = f"{name} ({group})"
            bucket = present if name.lower() in haystack else absent
            bucket.append(label)

    hit_count = len(present)
    skill_count = hit_count + len(absent)
    # max(..., 1) keeps an empty skill set from dividing by zero (score 0).
    coverage_pct = (hit_count / max(skill_count, 1)) * 100

    details = {
        "matched": present,
        "missing": absent,
        "coverage": f"{hit_count}/{skill_count}",
    }
    return coverage_pct, details


def compute_tfidf_similarity(cv_text: str, jd_text: str) -> Tuple[float, Dict]:
    """
    TF-IDF cosine similarity between CV and JD.

    Both texts are vectorized together (English stop words removed,
    unigrams and bigrams, at most 1000 features) and compared with cosine
    similarity. The ten terms with the largest product of CV and JD weights
    are reported as the top shared terms.

    This is a best-effort metric: if vectorization or comparison fails for
    any reason (for example, both texts are empty or contain only stop
    words), a neutral fallback of 50.0 is returned. In that case the details
    dict additionally carries an "error" entry so callers can tell the
    fallback apart from a genuinely computed 0.5 similarity.

    Args:
        cv_text: Plain text of the CV.
        jd_text: Plain text of the job description.

    Returns:
        (score 0-100, details dict) with "cosine_similarity" (rounded to
        4 decimals) and "top_shared_terms"; plus "error" on fallback.
    """
    try:
        vectorizer = TfidfVectorizer(
            stop_words="english",
            ngram_range=(1, 2),
            max_features=1000,
        )
        tfidf_matrix = vectorizer.fit_transform([cv_text, jd_text])
        # Convert the numpy scalar to a plain float so the report contains
        # only built-in types.
        sim = float(cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0])
        score = sim * 100

        # Find top shared terms
        feature_names = vectorizer.get_feature_names_out()
        cv_vec = tfidf_matrix[0].toarray()[0]
        jd_vec = tfidf_matrix[1].toarray()[0]

        # A term only has a non-zero product when it carries weight in both
        # documents, so the product ranks terms important in both.
        shared_importance = cv_vec * jd_vec
        top_shared_idx = np.argsort(shared_importance)[::-1][:10]
        top_shared = [feature_names[i] for i in top_shared_idx if shared_importance[i] > 0]

        return score, {"cosine_similarity": round(sim, 4), "top_shared_terms": top_shared}
    except Exception as exc:
        # Deliberate best-effort: keep the neutral fallback score, but record
        # why it was used instead of silently inventing a similarity value.
        return 50.0, {
            "cosine_similarity": 0.5,
            "top_shared_terms": [],
            "error": f"{type(exc).__name__}: {exc}",
        }


def compute_section_score(cv_text: str) -> Tuple[float, Dict]:
    """
    Score the CV on which standard ATS sections it contains.

    Required sections are detected with whole-word regexes over the
    lower-cased text and account for up to 80 points; bonus sections are
    detected by substring and account for up to 20 points.

    Args:
        cv_text: Plain text of the CV.

    Returns:
        (score 0-100, details dict) with "required_found",
        "required_missing" and "bonus_found".
    """
    heading_regexes = (
        ("summary", r'\b(professional\s+summary|summary|profile|objective)\b'),
        ("experience", r'\b(experience|work\s+experience|employment)\b'),
        ("skills", r'\b(skills|technical\s+skills|competencies)\b'),
        ("education", r'\b(education|academic)\b'),
    )
    text = cv_text.lower()

    present: list = []
    absent: list = []
    for name, regex in heading_regexes:
        target = present if re.search(regex, text) else absent
        target.append(name)

    # Bonus sections only need their name to occur anywhere in the text.
    extras = [name for name in BONUS_SECTIONS if name in text]

    # Required sections: 80% weight, bonus: 20%
    required_part = (len(present) / len(REQUIRED_SECTIONS)) * 80
    bonus_part = (len(extras) / max(len(BONUS_SECTIONS), 1)) * 20

    details = {
        "required_found": present,
        "required_missing": absent,
        "bonus_found": extras,
    }
    return min(required_part + bonus_part, 100), details


def compute_format_score(cv_text: str) -> Tuple[float, Dict]:
    """
    Rate how ATS-friendly the CV's formatting is.

    Six checks are run: email, phone number, bullet points, dates, overall
    word count (200-1200 words is considered good) and tab usage (five or
    more tab characters is treated as a multi-column layout). The score is
    the percentage of checks that passed.

    Args:
        cv_text: Plain text of the CV.

    Returns:
        (score 0-100, details dict) with human-readable "passed" and
        "failed" message lists.
    """
    passed: list = []
    failed: list = []

    # Pattern-based checks: (FORMAT_CHECKS key, message on hit, message on miss)
    regex_checks = (
        ("has_contact_info", "✅ Email found", "❌ No email detected"),
        ("has_phone", "✅ Phone number found", "⚠️ No phone number detected"),
        ("uses_bullet_points", "✅ Uses bullet points", "⚠️ No bullet points detected"),
        ("has_dates", "✅ Dates present", "⚠️ No dates detected"),
    )
    for check_name, hit_msg, miss_msg in regex_checks:
        if re.search(FORMAT_CHECKS[check_name], cv_text):
            passed.append(hit_msg)
        else:
            failed.append(miss_msg)

    # Length check (too short or too long)
    n_words = len(cv_text.split())
    if n_words < 200:
        failed.append(f"⚠️ Too short ({n_words} words, aim for 400+)")
    elif n_words > 1200:
        failed.append(f"⚠️ Very long ({n_words} words, consider condensing)")
    else:
        passed.append(f"✅ Good length ({n_words} words)")

    # Tab-heavy text is taken as a sign of a multi-column layout.
    if cv_text.count('\t') >= 5:
        failed.append("⚠️ Multiple columns detected (may break ATS)")
    else:
        passed.append("✅ Single-column layout")

    checks_run = len(passed) + len(failed)
    pct_passed = (len(passed) / max(checks_run, 1)) * 100

    return pct_passed, {"passed": passed, "failed": failed}


def run_ats_check(
    cv_text: str,
    job_description: str,
    keywords_data: Dict,
) -> Dict:
    """
    Run full ATS check with weighted scoring.

    Weights:
      Keyword Match: 35%
      Skill Coverage: 25%
      TF-IDF Similarity: 20%
      Section Completeness: 10%
      Formatting: 10%

    The weighted total is rounded to one decimal once, and that rounded
    value drives the reported score, the pass flag and the feedback tier,
    so the three can never disagree (e.g. a displayed 85.0 that fails).

    Args:
        cv_text: Plain text of the CV.
        job_description: Plain text of the job description.
        keywords_data: Dict read for the keys "required_keywords",
            "all_keywords_flat" and "ner_skills"; missing or None entries
            are treated as empty.

    Returns:
        Report dict with "total_score", "breakdown" (per-component score,
        weight label and details), "feedback" and "pass" (total >= 85).
    """
    # `or` also covers keys that are present but explicitly None.
    required_kws = keywords_data.get("required_keywords") or []
    all_kws = keywords_data.get("all_keywords_flat") or []
    ner_skills = keywords_data.get("ner_skills") or {}

    # Run all sub-scores
    kw_score, kw_details = compute_keyword_match_score(cv_text, required_kws, all_kws)
    skill_score, skill_details = compute_skill_coverage_score(cv_text, ner_skills)
    tfidf_score, tfidf_details = compute_tfidf_similarity(cv_text, job_description)
    section_score, section_details = compute_section_score(cv_text)
    format_score, format_details = compute_format_score(cv_text)

    # Weighted total, rounded once so every consumer sees the same number.
    total_score = round(
        kw_score * 0.35
        + skill_score * 0.25
        + tfidf_score * 0.20
        + section_score * 0.10
        + format_score * 0.10,
        1,
    )

    # Generate feedback
    feedback = _generate_feedback(
        total_score, kw_score, skill_score, tfidf_score,
        section_score, format_score,
        kw_details, skill_details, section_details, format_details,
    )

    return {
        "total_score": total_score,
        "breakdown": {
            "keyword_match": {"score": round(kw_score, 1), "weight": "35%", "details": kw_details},
            "skill_coverage": {"score": round(skill_score, 1), "weight": "25%", "details": skill_details},
            "tfidf_similarity": {"score": round(tfidf_score, 1), "weight": "20%", "details": tfidf_details},
            "section_completeness": {"score": round(section_score, 1), "weight": "10%", "details": section_details},
            "formatting": {"score": round(format_score, 1), "weight": "10%", "details": format_details},
        },
        "feedback": feedback,
        "pass": total_score >= 85,
    }


def _generate_feedback(
    total, kw_score, skill_score, tfidf_score,
    section_score, format_score,
    kw_details, skill_details, section_details, format_details,
) -> List[str]:
    """Generate actionable feedback based on scores."""
    feedback = []

    if total >= 85:
        feedback.append("🎉 Excellent ATS compatibility! This CV should pass most ATS systems.")
    elif total >= 70:
        feedback.append("👍 Good ATS compatibility. A few improvements recommended.")
    else:
        feedback.append("⚠️ Needs improvement for ATS compatibility.")

    # Keyword feedback
    missing = kw_details.get("required_missing", [])
    if missing:
        feedback.append(f"📝 Add these missing keywords: {', '.join(missing[:8])}")

    # Skill feedback
    missing_skills = skill_details.get("missing", [])
    if missing_skills:
        feedback.append(f"🔧 Missing skills to add: {', '.join(missing_skills[:5])}")

    # Section feedback
    missing_sections = section_details.get("required_missing", [])
    if missing_sections:
        feedback.append(f"📋 Add these sections: {', '.join(s.title() for s in missing_sections)}")

    # Format feedback
    failed_formats = format_details.get("failed", [])
    for f in failed_formats:
        feedback.append(f)

    # TF-IDF feedback
    if tfidf_score < 30:
        feedback.append("📊 Low semantic similarity — mirror more language from the job description.")

    return feedback


def format_ats_report(report: Dict) -> str:
    """
    Render an ATS report dict as a multi-line, human-readable string.

    The output has a banner with the total score and pass status, a bar
    chart line for each breakdown entry (one bar cell per 5 points, 20 cells
    wide), the feedback messages, and finally the missing and exactly
    matched required keywords (up to 10 each; partial matches are omitted).

    Args:
        report: Dict with "total_score", "pass", "breakdown" and "feedback".

    Returns:
        The formatted report, lines joined with newlines.
    """
    overall = report["total_score"]
    if overall >= 85:
        light = "🟢"
    elif overall >= 70:
        light = "🟡"
    else:
        light = "🔴"

    rule = f"{'═' * 50}"
    verdict = 'PASS ✅' if report['pass'] else 'NEEDS IMPROVEMENT ⚠️'
    out = [
        rule,
        f"  {light} ATS COMPATIBILITY SCORE: {overall}/100",
        f"  {verdict}",
        rule + "\n",
        "📊 SCORE BREAKDOWN:",
    ]

    # One bar per component
    for key, entry in report["breakdown"].items():
        filled = int(entry["score"] / 5)
        meter = "█" * filled + "░" * (20 - filled)
        label = key.replace("_", " ").title()
        out.append(f"  {meter} {entry['score']:5.1f}  {label} ({entry['weight']})")
    out.append("")

    # Feedback messages, indented
    out.append("💡 FEEDBACK:")
    out.extend(f"  {tip}" for tip in report["feedback"])
    out.append("")

    # Required-keyword summary
    kw_info = report["breakdown"]["keyword_match"]["details"]
    if kw_info.get("required_missing"):
        out.append(f"  🔑 Missing Keywords: {', '.join(kw_info['required_missing'][:10])}")
    exact_hits = [
        m for m in (kw_info.get("required_matched") or [])
        if "(partial)" not in m
    ]
    if exact_hits:
        out.append(f"  ✅ Matched Keywords: {', '.join(exact_hits[:10])}")

    return "\n".join(out)