""" Component 1: CV Parser Extracts text from PDF resumes and structures them into JSON sections. Uses pdfplumber for reliable text extraction. """ import pdfplumber import re import os import shutil import tempfile import logging logger = logging.getLogger(__name__) SECTION_HEADERS = [ "summary", "professional summary", "objective", "career objective", "profile", "about me", "about", "experience", "work experience", "professional experience", "employment history", "employment", "work history", "education", "academic background", "qualifications", "skills", "technical skills", "core competencies", "competencies", "key skills", "areas of expertise", "certifications", "certificates", "licenses", "projects", "key projects", "notable projects", "awards", "honors", "achievements", "publications", "research", "languages", "volunteer", "volunteering", "interests", "hobbies", "references", ] def extract_text_from_pdf(pdf_path: str) -> str: """Extract text from a PDF file using pdfplumber. Handles Gradio temp file paths that may get cleaned up.""" # If path doesn't exist, try common Gradio upload locations if not os.path.exists(pdf_path): logger.warning(f"PDF path not found: {pdf_path}") # Try the filename in /tmp/gradio/ basename = os.path.basename(pdf_path) alt_paths = [ f"/tmp/{basename}", os.path.join(tempfile.gettempdir(), basename), ] # Also search /tmp/gradio/ recursively gradio_tmp = "/tmp/gradio" if os.path.isdir(gradio_tmp): for root, dirs, files in os.walk(gradio_tmp): for f in files: if f == basename: alt_paths.insert(0, os.path.join(root, f)) found = False for alt in alt_paths: if os.path.exists(alt): pdf_path = alt found = True logger.info(f"Found PDF at alternate path: {alt}") break if not found: raise ValueError( f"PDF file not found. The uploaded file may have been cleaned up.\n" f"Please try uploading again, or paste your CV text directly." ) # Copy to a stable temp location before reading (prevents race conditions) stable_path = tempfile.mktemp(suffix='.pdf', prefix='cv_input_') try: shutil.copy2(pdf_path, stable_path) except Exception as e: logger.warning(f"Could not copy PDF: {e}, reading original") stable_path = pdf_path text_parts = [] try: with pdfplumber.open(stable_path) as pdf: for page in pdf.pages: page_text = page.extract_text() if page_text: text_parts.append(page_text) except Exception as e: raise ValueError(f"Failed to parse PDF: {str(e)}") finally: # Clean up our copy if stable_path != pdf_path and os.path.exists(stable_path): try: os.unlink(stable_path) except Exception: pass if not text_parts: raise ValueError("Could not extract any text from the PDF. Is it a scanned image?") return "\n".join(text_parts) def _normalize_header(header: str) -> str: header_lower = header.lower().strip() mapping = { "summary": "summary", "professional summary": "summary", "objective": "summary", "career objective": "summary", "profile": "summary", "about me": "summary", "about": "summary", "experience": "experience", "work experience": "experience", "professional experience": "experience", "employment history": "experience", "employment": "experience", "work history": "experience", "education": "education", "academic background": "education", "qualifications": "education", "skills": "skills", "technical skills": "skills", "core competencies": "skills", "competencies": "skills", "key skills": "skills", "areas of expertise": "skills", "certifications": "certifications", "certificates": "certifications", "licenses": "certifications", "projects": "projects", "key projects": "projects", "notable projects": "projects", "awards": "awards", "honors": "awards", "achievements": "awards", "publications": "publications", "research": "publications", "languages": "languages", "volunteer": "volunteer", "volunteering": "volunteer", "interests": "interests", "hobbies": "interests", "references": "references", } return mapping.get(header_lower, header_lower) def parse_cv_to_sections(raw_text: str) -> dict: lines = raw_text.strip().split("\n") if not lines: return {"name": "", "contact": "", "sections": {}, "raw_text": raw_text} name = "" contact_lines = [] header_start_idx = 0 for i, line in enumerate(lines[:5]): line = line.strip() if not line: continue if not name: if not re.search(r'[@|•·]|phone|email|linkedin|github|http', line, re.IGNORECASE): name = line header_start_idx = i + 1 continue if re.search(r'[@|•·]|phone|email|linkedin|github|http|\d{3}[-.\s]?\d{3}', line, re.IGNORECASE): contact_lines.append(line) header_start_idx = i + 1 contact = " | ".join(contact_lines) if contact_lines else "" alt_header_pattern = re.compile(r'^([A-Z][A-Z\s&/]{2,})\s*$') sections = {} current_section = "preamble" current_content = [] remaining_lines = lines[header_start_idx:] for line in remaining_lines: stripped = line.strip() if not stripped: current_content.append("") continue is_header = False header_name = "" for h in SECTION_HEADERS: if re.match(r'^' + re.escape(h) + r'\s*:?\s*$', stripped, re.IGNORECASE): is_header = True header_name = h break if not is_header and alt_header_pattern.match(stripped): normalized = stripped.lower().strip() if normalized in SECTION_HEADERS: is_header = True header_name = normalized if is_header: if current_content: content_text = "\n".join(current_content).strip() if content_text: sections[_normalize_header(current_section)] = content_text current_section = header_name current_content = [] else: current_content.append(line) if current_content: content_text = "\n".join(current_content).strip() if content_text: sections[_normalize_header(current_section)] = content_text return {"name": name, "contact": contact, "sections": sections, "raw_text": raw_text} def parse_cv(pdf_path: str) -> dict: """Full pipeline: PDF path -> structured CV dict.""" raw_text = extract_text_from_pdf(pdf_path) return parse_cv_to_sections(raw_text) def parse_cv_from_text(text: str) -> dict: """Parse from raw text (for when user pastes their CV).""" return parse_cv_to_sections(text)