"""
Component 1: CV Parser
Extracts text from PDF resumes and structures them into JSON sections.
Uses pdfplumber for reliable text extraction.
"""

import pdfplumber
import re
import os
import shutil
import tempfile
import logging

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Lowercase header texts that are recognised as the start of a CV section.
# Entries are grouped by the kind of section they introduce.
SECTION_HEADERS = [
    # Summary / profile
    "summary",
    "professional summary",
    "objective",
    "career objective",
    "profile",
    "about me",
    "about",
    # Work experience
    "experience",
    "work experience",
    "professional experience",
    "employment history",
    "employment",
    "work history",
    # Education
    "education",
    "academic background",
    "qualifications",
    # Skills
    "skills",
    "technical skills",
    "core competencies",
    "competencies",
    "key skills",
    "areas of expertise",
    # Certifications
    "certifications",
    "certificates",
    "licenses",
    # Projects
    "projects",
    "key projects",
    "notable projects",
    # Awards
    "awards",
    "honors",
    "achievements",
    # Publications
    "publications",
    "research",
    # Miscellaneous
    "languages",
    "volunteer",
    "volunteering",
    "interests",
    "hobbies",
    "references",
]


def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the text of a PDF file using pdfplumber.

    Handles Gradio temp file paths that may get cleaned up: when ``pdf_path``
    does not exist, a file with the same basename is looked for under
    ``/tmp/gradio`` (searched recursively), ``/tmp`` and the system temp
    directory. The PDF is then copied to a private temporary file before it
    is parsed; if that copy cannot be made, the located path is read directly.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The text of every page that yielded text, joined with newlines.

    Raises:
        ValueError: If the file cannot be located, if it cannot be parsed,
            or if no page yields any text.
    """
    if not os.path.exists(pdf_path):
        logger.warning("PDF path not found: %s", pdf_path)
        basename = os.path.basename(pdf_path)
        alt_paths = [
            f"/tmp/{basename}",
            os.path.join(tempfile.gettempdir(), basename),
        ]
        # Matches found under /tmp/gradio take priority over the generic
        # temp locations, so they are inserted at the front of the list.
        gradio_tmp = "/tmp/gradio"
        if os.path.isdir(gradio_tmp):
            for root, _dirs, files in os.walk(gradio_tmp):
                if basename in files:
                    alt_paths.insert(0, os.path.join(root, basename))

        found_path = next((p for p in alt_paths if os.path.exists(p)), None)
        if found_path is None:
            raise ValueError(
                "PDF file not found. The uploaded file may have been cleaned up.\n"
                "Please try uploading again, or paste your CV text directly."
            )
        logger.info("Found PDF at alternate path: %s", found_path)
        pdf_path = found_path

    # Copy to a stable temp location before reading, so parsing does not
    # depend on the upload staying in place. mkstemp creates the file
    # atomically with a unique name; the deprecated mktemp only returns a
    # name that another process could claim before we use it.
    read_path = pdf_path
    tmp_copy = None
    try:
        fd, tmp_copy = tempfile.mkstemp(suffix=".pdf", prefix="cv_input_")
        os.close(fd)
        shutil.copy2(pdf_path, tmp_copy)
        read_path = tmp_copy
    except OSError as e:
        # Best effort: fall back to the original file. A partially written
        # copy (if any) is still removed in the ``finally`` below.
        logger.warning("Could not copy PDF: %s, reading original", e)

    text_parts = []
    try:
        with pdfplumber.open(read_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text_parts.append(page_text)
    except Exception as e:
        # Any parser failure is reported as ValueError, keeping the cause.
        raise ValueError(f"Failed to parse PDF: {str(e)}") from e
    finally:
        # Clean up our copy; never touch the caller's file.
        if tmp_copy is not None:
            try:
                os.unlink(tmp_copy)
            except OSError:
                pass  # already gone or not removable; nothing more to do

    if not text_parts:
        raise ValueError("Could not extract any text from the PDF. Is it a scanned image?")

    return "\n".join(text_parts)


# Maps each recognised header variant (lowercase) to its canonical section
# key. Defined once at module level so it is not rebuilt on every call.
_HEADER_ALIASES = {
    "summary": "summary", "professional summary": "summary",
    "objective": "summary", "career objective": "summary",
    "profile": "summary", "about me": "summary", "about": "summary",
    "experience": "experience", "work experience": "experience",
    "professional experience": "experience", "employment history": "experience",
    "employment": "experience", "work history": "experience",
    "education": "education", "academic background": "education",
    "qualifications": "education",
    "skills": "skills", "technical skills": "skills",
    "core competencies": "skills", "competencies": "skills",
    "key skills": "skills", "areas of expertise": "skills",
    "certifications": "certifications", "certificates": "certifications",
    "licenses": "certifications",
    "projects": "projects", "key projects": "projects",
    "notable projects": "projects",
    "awards": "awards", "honors": "awards", "achievements": "awards",
    "publications": "publications", "research": "publications",
    "languages": "languages",
    "volunteer": "volunteer", "volunteering": "volunteer",
    "interests": "interests", "hobbies": "interests",
    "references": "references",
}


def _normalize_header(header: str) -> str:
    """Map a section header to its canonical section key.

    The header is lowercased and stripped of surrounding whitespace, then
    looked up in the alias table (e.g. "Work Experience" -> "experience").

    Args:
        header: Header text as found in the CV.

    Returns:
        The canonical key for known headers; for unknown headers, the
        lowercased and stripped header itself.
    """
    header_lower = header.lower().strip()
    return _HEADER_ALIASES.get(header_lower, header_lower)


# A line containing any of these is not accepted as the candidate's name.
_NAME_EXCLUSION_RE = re.compile(
    r'[@|•·]|phone|email|linkedin|github|http', re.IGNORECASE
)
# A line containing any of these (or a phone-like digit run) is contact info.
_CONTACT_LINE_RE = re.compile(
    r'[@|•·]|phone|email|linkedin|github|http|\d{3}[-.\s]?\d{3}', re.IGNORECASE
)


def _match_section_header(stripped_line: str) -> str:
    """Return the SECTION_HEADERS entry that the line consists of, or ""."""
    for candidate in SECTION_HEADERS:
        # The whole line must be the header, optionally followed by a colon.
        if re.match(r'^' + re.escape(candidate) + r'\s*:?\s*$', stripped_line, re.IGNORECASE):
            return candidate
    return ""


def _store_section(sections: dict, header: str, content_lines: list) -> None:
    """Save the collected lines under the normalized header, if non-empty."""
    content_text = "\n".join(content_lines).strip()
    if not content_text:
        return
    key = _normalize_header(header)
    if key in sections:
        # Different headers can normalize to the same key (e.g. "Summary"
        # and "Profile"); append instead of overwriting the earlier text.
        sections[key] = sections[key] + "\n\n" + content_text
    else:
        sections[key] = content_text


def parse_cv_to_sections(raw_text: str) -> dict:
    """Split raw CV text into name, contact details and named sections.

    The first five lines are scanned for the candidate's name (the first
    non-empty line that does not look like contact info) and for contact
    lines; the scan stops early at the first section header. The remaining
    lines are grouped under the section header that precedes them. Text
    before the first header is stored under "preamble". Section keys are
    normalized with ``_normalize_header``; sections whose headers normalize
    to the same key are joined with a blank line.

    Args:
        raw_text: Full CV text. ``None`` is treated as empty text.

    Returns:
        A dict with keys "name" (str), "contact" (contact lines joined with
        " | "), "sections" (dict of section key -> text) and "raw_text".
    """
    raw_text = raw_text or ""
    lines = raw_text.strip().split("\n")

    name = ""
    contact_lines = []
    header_start_idx = 0

    for i, line in enumerate(lines[:5]):
        line = line.strip()
        if not line:
            continue
        if _match_section_header(line):
            # The body starts here. Scanning further could consume this
            # header (or mistake it for the name) and lose the section.
            break
        if not name and not _NAME_EXCLUSION_RE.search(line):
            name = line
            header_start_idx = i + 1
            continue
        if _CONTACT_LINE_RE.search(line):
            contact_lines.append(line)
            header_start_idx = i + 1

    contact = " | ".join(contact_lines) if contact_lines else ""

    sections = {}
    current_section = "preamble"
    current_content = []

    for line in lines[header_start_idx:]:
        stripped = line.strip()
        if not stripped:
            # Keep blank lines so paragraph breaks inside a section survive.
            current_content.append("")
            continue

        header_name = _match_section_header(stripped)
        if header_name:
            _store_section(sections, current_section, current_content)
            current_section = header_name
            current_content = []
        else:
            current_content.append(line)

    # Flush whatever follows the last header.
    _store_section(sections, current_section, current_content)

    return {"name": name, "contact": contact, "sections": sections, "raw_text": raw_text}


def parse_cv(pdf_path: str) -> dict:
    """Run the full pipeline on a PDF: extract its text, then structure it.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The structured CV dict produced by ``parse_cv_to_sections``.
    """
    return parse_cv_to_sections(extract_text_from_pdf(pdf_path))


def parse_cv_from_text(text: str) -> dict:
    """Structure CV text supplied directly (e.g. pasted by the user).

    Args:
        text: Raw CV text.

    Returns:
        The structured CV dict produced by ``parse_cv_to_sections``.
    """
    structured_cv = parse_cv_to_sections(text)
    return structured_cv