Spaces:

aloysia98
/

ats-proof-cv-generator

Sleeping

App Files Files Community

aloysia98 committed on Apr 25

Commit

3a051f0

verified ·

1 Parent(s): d26caae

FIX: sanitize unicode chars (en-dash, smart quotes, bullets) before PDF rendering

Browse files

Files changed (1) hide show

docx_generator.py +56 -41

docx_generator.py CHANGED Viewed

@@ -11,7 +11,6 @@ from fpdf import FPDF
 logger = logging.getLogger(__name__)
-# Page dimensions (Letter: 215.9 x 279.4 mm)
 PAGE_W = 215.9
 PAGE_H = 279.4
 MARGIN_L = 15
@@ -19,11 +18,51 @@ MARGIN_R = 15
 MARGIN_T = 12
 MARGIN_B = 12
 USABLE_W = PAGE_W - MARGIN_L - MARGIN_R
-USABLE_H = PAGE_H - MARGIN_T - MARGIN_B
 def _parse_cv_text(cv_text: str) -> dict:
     """Parse CV text into name, contact, and ordered sections."""
     lines = cv_text.strip().split('\n')
     section_keywords = [
@@ -37,7 +76,6 @@ def _parse_cv_text(cv_text: str) -> dict:
         'languages', 'volunteer', 'publications', 'research',
     ]
-    # Ordered list of (header, content_lines)
     sections = []
     current_header = None
     current_lines = []
@@ -79,10 +117,10 @@ def _parse_cv_text(cv_text: str) -> dict:
                 continue
         if i < 4 and not current_header:
-            if not name_line and not re.search(r'[@|•·]|phone|email|linkedin', stripped, re.IGNORECASE):
                 name_line = stripped
                 continue
-            if re.search(r'[@|•·]|phone|email|linkedin|github|\d{3}[-.\s]?\d{3}', stripped, re.IGNORECASE):
                 contact_lines.append(stripped)
                 continue
@@ -104,8 +142,6 @@ def _parse_cv_text(cv_text: str) -> dict:
 class ResumePDF(FPDF):
-    """Custom PDF class for resume generation."""
     def __init__(self, font_sizes=None):
         super().__init__()
         self.font_sizes = font_sizes or {
@@ -116,7 +152,7 @@ class ResumePDF(FPDF):
         if not name:
             return
         self.set_font('Helvetica', 'B', self.font_sizes['name'])
-        self.set_text_color(31, 73, 125)  # Dark blue
         self.cell(USABLE_W, self.font_sizes['name'] * 0.5, name, align='C', new_x="LMARGIN", new_y="NEXT")
         self.ln(1)
@@ -133,7 +169,6 @@ class ResumePDF(FPDF):
         self.set_font('Helvetica', 'B', self.font_sizes['header'])
         self.set_text_color(31, 73, 125)
         self.cell(USABLE_W, self.font_sizes['header'] * 0.45, title.upper(), new_x="LMARGIN", new_y="NEXT")
-        # Thin line under header
         y = self.get_y()
         self.set_draw_color(180, 180, 180)
         self.line(MARGIN_L, y, PAGE_W - MARGIN_R, y)
@@ -148,10 +183,9 @@ class ResumePDF(FPDF):
     def _add_bullet(self, text):
         self.set_font('Helvetica', '', self.font_sizes['bullet'])
         self.set_text_color(50, 50, 50)
-        bullet_indent = 4
-        self.set_x(MARGIN_L + bullet_indent)
-        # Use a simple dash as bullet for maximum ATS compatibility
-        self.multi_cell(USABLE_W - bullet_indent, self.font_sizes['bullet'] * 0.42, f"- {text}")
         self.ln(0.3)
     def _add_subheader(self, text):
@@ -161,16 +195,12 @@ class ResumePDF(FPDF):
         self.ln(0.3)
     def _add_section_content(self, content):
-        lines = content.split('\n')
-        for line in lines:
             stripped = line.strip()
             if not stripped:
                 continue
-            # Bullet point
-            if re.match(r'^[•\-\*]\s', stripped):
-                bullet_text = re.sub(r'^[•\-\*]\s*', '', stripped)
-                self._add_bullet(bullet_text)
-            # Sub-header (Job Title | Company | Date)
             elif re.match(r'^[A-Z].*\|', stripped) or (stripped.endswith(':') and len(stripped) < 80):
                 self._add_subheader(stripped.rstrip(':'))
             else:
@@ -178,7 +208,6 @@ class ResumePDF(FPDF):
 def _build_pdf(parsed, font_sizes):
-    """Build PDF with given font sizes. Returns (pdf_bytes, page_count)."""
     pdf = ResumePDF(font_sizes=font_sizes)
     pdf.add_page()
     pdf.set_margins(MARGIN_L, MARGIN_T, MARGIN_R)
@@ -196,44 +225,30 @@ def _build_pdf(parsed, font_sizes):
 def generate_pdf(cv_text: str) -> bytes:
-    """
-    Generate a 1-page ATS-friendly PDF from CV text.
-    Auto-shrinks fonts if content overflows 1 page.
-    """
     parsed = _parse_cv_text(cv_text)
-    # Start with standard sizes, shrink if needed
     base_sizes = {'name': 16, 'contact': 8, 'header': 10, 'body': 9.5, 'bullet': 9.5}
-    # Try progressively smaller fonts until it fits 1 page
     for shrink_step in range(12):
-        factor = 1.0 - (shrink_step * 0.05)  # 1.0, 0.95, 0.90, ... 0.45
         sizes = {k: max(v * factor, 5.5) for k, v in base_sizes.items()}
         pdf_bytes, page_count = _build_pdf(parsed, sizes)
         if page_count <= 1:
-            logger.info(f"PDF fits 1 page (font factor={factor:.2f}, body={sizes['body']:.1f}pt)")
             return pdf_bytes
-    # Last resort: already at minimum fonts
-    logger.warning("PDF couldn't fit 1 page even at minimum font size")
     return pdf_bytes
 def save_pdf_to_file(cv_text: str, output_path: str = None) -> str:
-    """Save PDF to file, return path."""
     if output_path is None:
         output_path = tempfile.mktemp(suffix='.pdf', prefix='ats_cv_')
-    pdf_bytes = generate_pdf(cv_text)
     with open(output_path, 'wb') as f:
-        f.write(pdf_bytes)
     return output_path
-# Keep old names for backward compatibility
 def save_docx_to_file(cv_text: str, output_path: str = None) -> str:
-    """Backward compat — now generates PDF instead of DOCX."""
     return save_pdf_to_file(cv_text, output_path)

 logger = logging.getLogger(__name__)
 PAGE_W = 215.9
 PAGE_H = 279.4
 MARGIN_L = 15
 MARGIN_T = 12
 MARGIN_B = 12
 USABLE_W = PAGE_W - MARGIN_L - MARGIN_R
+def _sanitize_text(text: str) -> str:
+    """Replace unicode characters with ASCII equivalents for PDF compatibility."""
+    replacements = {
+        '\u2013': '-',   # en-dash
+        '\u2014': '-',   # em-dash
+        '\u2018': "'",   # left single quote
+        '\u2019': "'",   # right single quote
+        '\u201c': '"',   # left double quote
+        '\u201d': '"',   # right double quote
+        '\u2022': '-',   # bullet
+        '\u2023': '-',   # triangular bullet
+        '\u2027': '-',   # hyphenation point
+        '\u2043': '-',   # hyphen bullet
+        '\u25aa': '-',   # small black square
+        '\u25cf': '-',   # black circle
+        '\u25cb': '-',   # white circle
+        '\u25e6': '-',   # white bullet
+        '\u2026': '...', # ellipsis
+        '\u00b7': '-',   # middle dot
+        '\u2219': '-',   # bullet operator
+        '\u00a0': ' ',   # non-breaking space
+        '\u200b': '',    # zero-width space
+        '\u200e': '',    # left-to-right mark
+        '\u200f': '',    # right-to-left mark
+        '\ufeff': '',    # BOM
+        '\u00e9': 'e',   # e-acute
+        '\u00e8': 'e',   # e-grave
+        '\u00f1': 'n',   # n-tilde
+        '\u00fc': 'u',   # u-umlaut
+        '\u00e4': 'a',   # a-umlaut
+        '\u00f6': 'o',   # o-umlaut
+        '\u00df': 'ss',  # eszett
+    }
+    for orig, repl in replacements.items():
+        text = text.replace(orig, repl)
+    # Remove any remaining non-ASCII that would break the font
+    text = text.encode('ascii', 'replace').decode('ascii')
+    return text
 def _parse_cv_text(cv_text: str) -> dict:
     """Parse CV text into name, contact, and ordered sections."""
+    cv_text = _sanitize_text(cv_text)
     lines = cv_text.strip().split('\n')
     section_keywords = [
         'languages', 'volunteer', 'publications', 'research',
     ]
     sections = []
     current_header = None
     current_lines = []
                 continue
         if i < 4 and not current_header:
+            if not name_line and not re.search(r'[@|•\-·]|phone|email|linkedin', stripped, re.IGNORECASE):
                 name_line = stripped
                 continue
+            if re.search(r'[@|•\-·]|phone|email|linkedin|github|\d{3}[-.\s]?\d{3}', stripped, re.IGNORECASE):
                 contact_lines.append(stripped)
                 continue
 class ResumePDF(FPDF):
     def __init__(self, font_sizes=None):
         super().__init__()
         self.font_sizes = font_sizes or {
         if not name:
             return
         self.set_font('Helvetica', 'B', self.font_sizes['name'])
+        self.set_text_color(31, 73, 125)
         self.cell(USABLE_W, self.font_sizes['name'] * 0.5, name, align='C', new_x="LMARGIN", new_y="NEXT")
         self.ln(1)
         self.set_font('Helvetica', 'B', self.font_sizes['header'])
         self.set_text_color(31, 73, 125)
         self.cell(USABLE_W, self.font_sizes['header'] * 0.45, title.upper(), new_x="LMARGIN", new_y="NEXT")
         y = self.get_y()
         self.set_draw_color(180, 180, 180)
         self.line(MARGIN_L, y, PAGE_W - MARGIN_R, y)
     def _add_bullet(self, text):
         self.set_font('Helvetica', '', self.font_sizes['bullet'])
         self.set_text_color(50, 50, 50)
+        indent = 4
+        self.set_x(MARGIN_L + indent)
+        self.multi_cell(USABLE_W - indent, self.font_sizes['bullet'] * 0.42, f"- {text}")
         self.ln(0.3)
     def _add_subheader(self, text):
         self.ln(0.3)
     def _add_section_content(self, content):
+        for line in content.split('\n'):
             stripped = line.strip()
             if not stripped:
                 continue
+            if re.match(r'^[\-\*]\s', stripped):
+                self._add_bullet(re.sub(r'^[\-\*]\s*', '', stripped))
             elif re.match(r'^[A-Z].*\|', stripped) or (stripped.endswith(':') and len(stripped) < 80):
                 self._add_subheader(stripped.rstrip(':'))
             else:
 def _build_pdf(parsed, font_sizes):
     pdf = ResumePDF(font_sizes=font_sizes)
     pdf.add_page()
     pdf.set_margins(MARGIN_L, MARGIN_T, MARGIN_R)
 def generate_pdf(cv_text: str) -> bytes:
     """Render CV text as a PDF, shrinking fonts until it fits on one page.

     The text is parsed once, then rendered up to 12 times. Each attempt
     scales every base font size down by a further 5% (factor 1.00, 0.95,
     ...), never going below 5.5pt. The first render whose page count is
     at most 1 is returned.

     Args:
         cv_text: Raw CV text to parse and render.

     Returns:
         The PDF bytes of the first one-page render, or of the last attempt
         if no attempt fit on one page (a warning is logged in that case).
     """
     cv_data = _parse_cv_text(cv_text)
     starting_pts = {'name': 16, 'contact': 8, 'header': 10, 'body': 9.5, 'bullet': 9.5}
     floor_pt = 5.5

     for attempt in range(12):
         factor = 1.0 - (attempt * 0.05)

         # Scale every font size, clamped to the minimum size.
         scaled = {}
         for key, pts in starting_pts.items():
             scaled[key] = max(pts * factor, floor_pt)

         rendered, pages = _build_pdf(cv_data, scaled)
         if pages > 1:
             continue

         logger.info(f"PDF: 1 page (factor={factor:.2f}, body={scaled['body']:.1f}pt)")
         return rendered

     # Every attempt overflowed; hand back the smallest-font render.
     logger.warning("PDF: couldn't fit 1 page at min font")
     return rendered
 def save_pdf_to_file(cv_text: str, output_path: str = None) -> str:
     """Generate the CV PDF and write it to disk.

     Args:
         cv_text: Raw CV text passed to ``generate_pdf``.
         output_path: Destination path. When ``None``, a new temporary file
             named ``ats_cv_*.pdf`` is created and kept on disk for the
             caller to use (and eventually delete).

     Returns:
         The path of the file that was written.
     """
     # Render before touching the filesystem so a generation failure never
     # leaves an empty or truncated file behind.
     pdf_bytes = generate_pdf(cv_text)

     if output_path is None:
         # NamedTemporaryFile creates the file atomically, unlike the
         # deprecated, race-prone tempfile.mktemp(). delete=False keeps the
         # file after the handle is closed.
         with tempfile.NamedTemporaryFile(
             suffix='.pdf', prefix='ats_cv_', delete=False
         ) as tmp:
             tmp.write(pdf_bytes)
             return tmp.name

     with open(output_path, 'wb') as f:
         f.write(pdf_bytes)
     return output_path
+# Backward compat
 def save_docx_to_file(cv_text: str, output_path: str = None) -> str:
     """Legacy entry point kept for older callers.

     Despite its name, this delegates directly to ``save_pdf_to_file`` and
     returns whatever path that function returns.
     """
     written_path = save_pdf_to_file(cv_text, output_path)
     return written_path