aloysia98 committed on
Commit
3a051f0
·
verified ·
1 Parent(s): d26caae

FIX: sanitize unicode chars (en-dash, smart quotes, bullets) before PDF rendering

Browse files
Files changed (1) hide show
  1. docx_generator.py +56 -41
docx_generator.py CHANGED
@@ -11,7 +11,6 @@ from fpdf import FPDF
11
 
12
  logger = logging.getLogger(__name__)
13
 
14
- # Page dimensions (Letter: 215.9 x 279.4 mm)
15
  PAGE_W = 215.9
16
  PAGE_H = 279.4
17
  MARGIN_L = 15
@@ -19,11 +18,51 @@ MARGIN_R = 15
19
  MARGIN_T = 12
20
  MARGIN_B = 12
21
  USABLE_W = PAGE_W - MARGIN_L - MARGIN_R
22
- USABLE_H = PAGE_H - MARGIN_T - MARGIN_B
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
 
25
  def _parse_cv_text(cv_text: str) -> dict:
26
  """Parse CV text into name, contact, and ordered sections."""
 
27
  lines = cv_text.strip().split('\n')
28
 
29
  section_keywords = [
@@ -37,7 +76,6 @@ def _parse_cv_text(cv_text: str) -> dict:
37
  'languages', 'volunteer', 'publications', 'research',
38
  ]
39
 
40
- # Ordered list of (header, content_lines)
41
  sections = []
42
  current_header = None
43
  current_lines = []
@@ -79,10 +117,10 @@ def _parse_cv_text(cv_text: str) -> dict:
79
  continue
80
 
81
  if i < 4 and not current_header:
82
- if not name_line and not re.search(r'[@|•·]|phone|email|linkedin', stripped, re.IGNORECASE):
83
  name_line = stripped
84
  continue
85
- if re.search(r'[@|•·]|phone|email|linkedin|github|\d{3}[-.\s]?\d{3}', stripped, re.IGNORECASE):
86
  contact_lines.append(stripped)
87
  continue
88
 
@@ -104,8 +142,6 @@ def _parse_cv_text(cv_text: str) -> dict:
104
 
105
 
106
  class ResumePDF(FPDF):
107
- """Custom PDF class for resume generation."""
108
-
109
  def __init__(self, font_sizes=None):
110
  super().__init__()
111
  self.font_sizes = font_sizes or {
@@ -116,7 +152,7 @@ class ResumePDF(FPDF):
116
  if not name:
117
  return
118
  self.set_font('Helvetica', 'B', self.font_sizes['name'])
119
- self.set_text_color(31, 73, 125) # Dark blue
120
  self.cell(USABLE_W, self.font_sizes['name'] * 0.5, name, align='C', new_x="LMARGIN", new_y="NEXT")
121
  self.ln(1)
122
 
@@ -133,7 +169,6 @@ class ResumePDF(FPDF):
133
  self.set_font('Helvetica', 'B', self.font_sizes['header'])
134
  self.set_text_color(31, 73, 125)
135
  self.cell(USABLE_W, self.font_sizes['header'] * 0.45, title.upper(), new_x="LMARGIN", new_y="NEXT")
136
- # Thin line under header
137
  y = self.get_y()
138
  self.set_draw_color(180, 180, 180)
139
  self.line(MARGIN_L, y, PAGE_W - MARGIN_R, y)
@@ -148,10 +183,9 @@ class ResumePDF(FPDF):
148
  def _add_bullet(self, text):
149
  self.set_font('Helvetica', '', self.font_sizes['bullet'])
150
  self.set_text_color(50, 50, 50)
151
- bullet_indent = 4
152
- self.set_x(MARGIN_L + bullet_indent)
153
- # Use a simple dash as bullet for maximum ATS compatibility
154
- self.multi_cell(USABLE_W - bullet_indent, self.font_sizes['bullet'] * 0.42, f"- {text}")
155
  self.ln(0.3)
156
 
157
  def _add_subheader(self, text):
@@ -161,16 +195,12 @@ class ResumePDF(FPDF):
161
  self.ln(0.3)
162
 
163
  def _add_section_content(self, content):
164
- lines = content.split('\n')
165
- for line in lines:
166
  stripped = line.strip()
167
  if not stripped:
168
  continue
169
- # Bullet point
170
- if re.match(r'^[\-\*]\s', stripped):
171
- bullet_text = re.sub(r'^[•\-\*]\s*', '', stripped)
172
- self._add_bullet(bullet_text)
173
- # Sub-header (Job Title | Company | Date)
174
  elif re.match(r'^[A-Z].*\|', stripped) or (stripped.endswith(':') and len(stripped) < 80):
175
  self._add_subheader(stripped.rstrip(':'))
176
  else:
@@ -178,7 +208,6 @@ class ResumePDF(FPDF):
178
 
179
 
180
  def _build_pdf(parsed, font_sizes):
181
- """Build PDF with given font sizes. Returns (pdf_bytes, page_count)."""
182
  pdf = ResumePDF(font_sizes=font_sizes)
183
  pdf.add_page()
184
  pdf.set_margins(MARGIN_L, MARGIN_T, MARGIN_R)
@@ -196,44 +225,30 @@ def _build_pdf(parsed, font_sizes):
196
 
197
 
198
  def generate_pdf(cv_text: str) -> bytes:
199
- """
200
- Generate a 1-page ATS-friendly PDF from CV text.
201
- Auto-shrinks fonts if content overflows 1 page.
202
- """
203
  parsed = _parse_cv_text(cv_text)
204
-
205
- # Start with standard sizes, shrink if needed
206
  base_sizes = {'name': 16, 'contact': 8, 'header': 10, 'body': 9.5, 'bullet': 9.5}
207
 
208
- # Try progressively smaller fonts until it fits 1 page
209
  for shrink_step in range(12):
210
- factor = 1.0 - (shrink_step * 0.05) # 1.0, 0.95, 0.90, ... 0.45
211
  sizes = {k: max(v * factor, 5.5) for k, v in base_sizes.items()}
212
-
213
  pdf_bytes, page_count = _build_pdf(parsed, sizes)
214
-
215
  if page_count <= 1:
216
- logger.info(f"PDF fits 1 page (font factor={factor:.2f}, body={sizes['body']:.1f}pt)")
217
  return pdf_bytes
218
 
219
- # Last resort: already at minimum fonts
220
- logger.warning("PDF couldn't fit 1 page even at minimum font size")
221
  return pdf_bytes
222
 
223
 
224
  def save_pdf_to_file(cv_text: str, output_path: str = None) -> str:
225
- """Save PDF to file, return path."""
226
  if output_path is None:
227
  output_path = tempfile.mktemp(suffix='.pdf', prefix='ats_cv_')
228
-
229
- pdf_bytes = generate_pdf(cv_text)
230
  with open(output_path, 'wb') as f:
231
- f.write(pdf_bytes)
232
-
233
  return output_path
234
 
235
 
236
- # Keep old names for backward compatibility
237
  def save_docx_to_file(cv_text: str, output_path: str = None) -> str:
238
- """Backward compat — now generates PDF instead of DOCX."""
239
  return save_pdf_to_file(cv_text, output_path)
 
11
 
12
  logger = logging.getLogger(__name__)
13
 
 
14
  PAGE_W = 215.9
15
  PAGE_H = 279.4
16
  MARGIN_L = 15
 
18
  MARGIN_T = 12
19
  MARGIN_B = 12
20
  USABLE_W = PAGE_W - MARGIN_L - MARGIN_R
21
+
22
+
23
+ def _sanitize_text(text: str) -> str:
24
+ """Replace unicode characters with ASCII equivalents for PDF compatibility."""
25
+ replacements = {
26
+ '\u2013': '-', # en-dash
27
+ '\u2014': '-', # em-dash
28
+ '\u2018': "'", # left single quote
29
+ '\u2019': "'", # right single quote
30
+ '\u201c': '"', # left double quote
31
+ '\u201d': '"', # right double quote
32
+ '\u2022': '-', # bullet
33
+ '\u2023': '-', # triangular bullet
34
+ '\u2027': '-', # hyphenation point
35
+ '\u2043': '-', # hyphen bullet
36
+ '\u25aa': '-', # small black square
37
+ '\u25cf': '-', # black circle
38
+ '\u25cb': '-', # white circle
39
+ '\u25e6': '-', # white bullet
40
+ '\u2026': '...', # ellipsis
41
+ '\u00b7': '-', # middle dot
42
+ '\u2219': '-', # bullet operator
43
+ '\u00a0': ' ', # non-breaking space
44
+ '\u200b': '', # zero-width space
45
+ '\u200e': '', # left-to-right mark
46
+ '\u200f': '', # right-to-left mark
47
+ '\ufeff': '', # BOM
48
+ '\u00e9': 'e', # e-acute
49
+ '\u00e8': 'e', # e-grave
50
+ '\u00f1': 'n', # n-tilde
51
+ '\u00fc': 'u', # u-umlaut
52
+ '\u00e4': 'a', # a-umlaut
53
+ '\u00f6': 'o', # o-umlaut
54
+ '\u00df': 'ss', # eszett
55
+ }
56
+ for orig, repl in replacements.items():
57
+ text = text.replace(orig, repl)
58
+ # Remove any remaining non-ASCII that would break the font
59
+ text = text.encode('ascii', 'replace').decode('ascii')
60
+ return text
61
 
62
 
63
  def _parse_cv_text(cv_text: str) -> dict:
64
  """Parse CV text into name, contact, and ordered sections."""
65
+ cv_text = _sanitize_text(cv_text)
66
  lines = cv_text.strip().split('\n')
67
 
68
  section_keywords = [
 
76
  'languages', 'volunteer', 'publications', 'research',
77
  ]
78
 
 
79
  sections = []
80
  current_header = None
81
  current_lines = []
 
117
  continue
118
 
119
  if i < 4 and not current_header:
120
+ if not name_line and not re.search(r'[@|•\-·]|phone|email|linkedin', stripped, re.IGNORECASE):
121
  name_line = stripped
122
  continue
123
+ if re.search(r'[@|•\-·]|phone|email|linkedin|github|\d{3}[-.\s]?\d{3}', stripped, re.IGNORECASE):
124
  contact_lines.append(stripped)
125
  continue
126
 
 
142
 
143
 
144
  class ResumePDF(FPDF):
 
 
145
  def __init__(self, font_sizes=None):
146
  super().__init__()
147
  self.font_sizes = font_sizes or {
 
152
  if not name:
153
  return
154
  self.set_font('Helvetica', 'B', self.font_sizes['name'])
155
+ self.set_text_color(31, 73, 125)
156
  self.cell(USABLE_W, self.font_sizes['name'] * 0.5, name, align='C', new_x="LMARGIN", new_y="NEXT")
157
  self.ln(1)
158
 
 
169
  self.set_font('Helvetica', 'B', self.font_sizes['header'])
170
  self.set_text_color(31, 73, 125)
171
  self.cell(USABLE_W, self.font_sizes['header'] * 0.45, title.upper(), new_x="LMARGIN", new_y="NEXT")
 
172
  y = self.get_y()
173
  self.set_draw_color(180, 180, 180)
174
  self.line(MARGIN_L, y, PAGE_W - MARGIN_R, y)
 
183
  def _add_bullet(self, text):
184
  self.set_font('Helvetica', '', self.font_sizes['bullet'])
185
  self.set_text_color(50, 50, 50)
186
+ indent = 4
187
+ self.set_x(MARGIN_L + indent)
188
+ self.multi_cell(USABLE_W - indent, self.font_sizes['bullet'] * 0.42, f"- {text}")
 
189
  self.ln(0.3)
190
 
191
  def _add_subheader(self, text):
 
195
  self.ln(0.3)
196
 
197
  def _add_section_content(self, content):
198
+ for line in content.split('\n'):
 
199
  stripped = line.strip()
200
  if not stripped:
201
  continue
202
+ if re.match(r'^[\-\*]\s', stripped):
203
+ self._add_bullet(re.sub(r'^[\-\*]\s*', '', stripped))
 
 
 
204
  elif re.match(r'^[A-Z].*\|', stripped) or (stripped.endswith(':') and len(stripped) < 80):
205
  self._add_subheader(stripped.rstrip(':'))
206
  else:
 
208
 
209
 
210
  def _build_pdf(parsed, font_sizes):
 
211
  pdf = ResumePDF(font_sizes=font_sizes)
212
  pdf.add_page()
213
  pdf.set_margins(MARGIN_L, MARGIN_T, MARGIN_R)
 
225
 
226
 
227
  def generate_pdf(cv_text: str) -> bytes:
228
+ """Generate 1-page ATS PDF. Auto-shrinks fonts if needed."""
 
 
 
229
  parsed = _parse_cv_text(cv_text)
 
 
230
  base_sizes = {'name': 16, 'contact': 8, 'header': 10, 'body': 9.5, 'bullet': 9.5}
231
 
 
232
  for shrink_step in range(12):
233
+ factor = 1.0 - (shrink_step * 0.05)
234
  sizes = {k: max(v * factor, 5.5) for k, v in base_sizes.items()}
 
235
  pdf_bytes, page_count = _build_pdf(parsed, sizes)
 
236
  if page_count <= 1:
237
+ logger.info(f"PDF: 1 page (factor={factor:.2f}, body={sizes['body']:.1f}pt)")
238
  return pdf_bytes
239
 
240
+ logger.warning("PDF: couldn't fit 1 page at min font")
 
241
  return pdf_bytes
242
 
243
 
244
  def save_pdf_to_file(cv_text: str, output_path: str = None) -> str:
 
245
  if output_path is None:
246
  output_path = tempfile.mktemp(suffix='.pdf', prefix='ats_cv_')
 
 
247
  with open(output_path, 'wb') as f:
248
+ f.write(generate_pdf(cv_text))
 
249
  return output_path
250
 
251
 
252
+ # Backward compat
253
  def save_docx_to_file(cv_text: str, output_path: str = None) -> str:
 
254
  return save_pdf_to_file(cv_text, output_path)