Class: PrmGenerator::ContentSanitizer

Inherits:
Object
  • Object
show all
Extended by:
T::Sig
Defined in:
lib/udb/prm_generator.rb

Overview

Sanitizes content to fix HTML entities and other formatting issues

Class Method Summary collapse

Class Method Details

.sanitize(content) ⇒ Object



314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
# File 'lib/udb/prm_generator.rb', line 314

# Decodes a fixed set of HTML character references into literal characters
# and normalizes line endings and blank lines.
#
# All references are decoded in a single pass, so text produced by one
# replacement is never decoded again (e.g. "&amp;nbsp;" becomes "&nbsp;").
# Unknown named references and numeric references that are not valid Unicode
# scalar values are left in the text unchanged.
#
# @param content [Object] text to sanitize; anything that is not a String
#   yields an empty string
# @return [String] a new, UTF-8 tagged string; the argument is not modified.
#   If sanitization raises, a warning is printed and the content as it stood
#   at the point of failure is returned.
def self.sanitize(content)
  return "" unless content.is_a?(String)

  # Named references handled here. Note that &nbsp; deliberately becomes a
  # regular space rather than U+00A0.
  named = {
    'ne' => '≠', 'ge' => '≥', 'le' => '≤',
    'gt' => '>', 'lt' => '<', 'amp' => '&',
    'pm' => '±', 'times' => '×', 'divide' => '÷',
    'nbsp' => ' ', 'quot' => '"'
  }

  # Capture 1: hexadecimal reference, capture 2: decimal reference,
  # capture 3: named reference.
  content = content.gsub(/&(?:#x([0-9a-fA-F]+)|#([0-9]+)|([a-z]+));/) do
    ref = Regexp.last_match
    if ref[3]
      named.fetch(ref[3], ref[0])
    else
      codepoint = ref[1] ? ref[1].to_i(16) : ref[2].to_i
      # Surrogates and values above U+10FFFF cannot be encoded as valid UTF-8;
      # keep the reference text instead of aborting the whole sanitization.
      scalar = codepoint <= 0x10FFFF && !(0xD800..0xDFFF).cover?(codepoint)
      scalar ? [codepoint].pack('U') : ref[0]
    end
  end

  # Normalize line endings first so CRLF blank lines are collapsed as well.
  content = content.gsub("\r\n", "\n")
  content = content.gsub(/\n{3,}/, "\n\n") # Remove excessive blank lines

  # Ensure UTF-8 encoding (gsub returned a new string, so the caller's
  # object is untouched).
  content.force_encoding('UTF-8')
rescue StandardError => e
  puts "[WARN] Content sanitization failed: #{e.message}"
  content.to_s
end