scinote-web/app/services/reports/html_to_word_converter.rb

# frozen_string_literal: true

module Reports
  class HtmlToWordConverter
    def initialize(document, options = {})
      @docx = document
      @scinote_url = options[:scinote_url]
      @link_style = options[:link_style]
    end

    def html_to_word_converter(text)
      html = Nokogiri::HTML(text)
      raw_elements = recursive_children(html.css('body').children, []).compact

      # Combined raw text blocks in paragraphs
      elements = combine_docx_elements(raw_elements)

      # Draw elements
      elements.each do |elem|
        if elem[:type] == 'p'
          Reports::DocxRenderer.render_p_element(@docx, elem, scinote_url: @scinote_url, link_style: @link_style)
        elsif elem[:type] == 'table'
          Reports::DocxRenderer.render_table_element(@docx, elem)
        elsif elem[:type] == 'newline'
          style = elem[:style] || {}
          # print heading if its heading
          # Mixing heading with other style setting causes problems for Word
          if %w(h1 h2 h3 h4 h5 h6).include?(style[:style])
            @docx.public_send(style[:style], elem[:value])
          else
            @docx.p elem[:value] do
              align style[:align]
              color style[:color]
              bold style[:bold]
              italic style[:italic]
            end
          end
        elsif elem[:type] == 'image'
          Reports::DocxRenderer.render_img_element(@docx, elem)
        elsif %w(ul ol).include?(elem[:type])
          Reports::DocxRenderer.render_list_element(@docx, elem)
        end
      end
    end

    private

    def combine_docx_elements(raw_elements)
      # Word does not support some nested elements, move some elements to root level
      elements = []
      temp_p = []
      raw_elements.each do |elem|
        next unless elem

        if %w(image newline table ol ul).include? elem[:type]
          unless temp_p.blank?
            elements.push(type: 'p', children: temp_p)
            temp_p = []
          end
          elements.push(elem)
        elsif %w(br text a sup sub).include? elem[:type]
          temp_p.push(elem)
        end
      end
      elements.push(type: 'p', children: temp_p)
      elements
    end

    # Convert HTML structure to plain text structure
    # rubocop:disable Metrics/BlockLength
    def recursive_children(children, elements, skip_newline = false)
      children.each do |elem|
        if elem.class == Nokogiri::XML::Text
          next if elem.text.strip == ' ' # Invisible symbol

          style = paragraph_styling(elem.parent)
          type = !skip_newline && ((style[:align] && style[:align] != :justify) || style[:style]) ? 'newline' : 'text'

          text = smart_annotation_check(elem)

          elements.push(
            type: type,
            value: text.strip.delete(' '), # Invisible symbol
            style: style
          )
          next
        end

        if elem.name == 'br'
          elements.push(type: 'br')
          next
        end

        if elem.name == 'img'
          elements.push(img_element(elem))
          next
        end

        if elem.name == 'a'
          elements.push(link_element(elem))
          next
        end

        if elem.name == 'table'
          elements.push(tiny_mce_table_element(elem))
          next
        end

        if %w(sup sub).include?(elem.name)
          elements.push(text_formatting_element(elem))
          next
        end

        if %w(ul ol).include?(elem.name)
          elements.push(list_element(elem))
          next
        end
        elements = recursive_children(elem.children, elements, skip_newline) if elem.children
      end
      elements
    end

    # rubocop:enable Metrics/BlockLength

    def img_element(elem)
      return unless elem.attributes['data-mce-token']

      image = TinyMceAsset.find_by(id: Base62.decode(elem.attributes['data-mce-token'].value))
      return unless image

      image_path = Reports::Utils.image_prepare(image).processed.url

      dimension = FastImage.size(image_path)

      return unless dimension

      style = image_styling(elem, dimension)

      { type: 'image', data: image_path.split('&')[0], blob: image.blob, style: style }
    end

    def link_element(elem)
      text = elem.text
      link = elem.attributes['href'].value if elem.attributes['href']
      if elem.attributes['class']&.value == 'record-info-link'
        link = nil
        text = "##{text}"
      end
      text = "##{text}" if elem.parent.attributes['class']&.value == 'atwho-inserted'
      text = "@#{text}" if elem.attributes['class']&.value == 'atwho-user-popover'
      {
        type: 'a',
        value: text,
        link: link
      }
    end

    def list_element(list_element)
      allowed_elements = %w(li ul ol a img strong em h1 h2 h2 h3 h4 h5 span p)
      data_array = list_element.children.select { |n| allowed_elements.include?(n.name) }.map do |li_child|
        li_child.children.map do |item|
          if item.is_a? Nokogiri::XML::Text
            item.text.chomp
          elsif %w(ul ol).include?(item.name)
            list_element(item)
          elsif %w(a).include?(item.name)
            link_element(item)
          elsif %w(img).include?(item.name)
            img_element(item)&.merge(bookmark_id: SecureRandom.hex)
          elsif %w(table).include?(item.name)
            tiny_mce_table_element(item).merge(bookmark_id: SecureRandom.hex)
          elsif %w(strong em h1 h2 h2 h3 h4 h5 span p).include?(item.name)
            # Pass styles and extend renderer for li with style, some limitations on li items
            # { type: 'text', value: item[:value], style: paragraph_styling(item) }
            item.children.text
          end
        end.reject(&:blank?)
      end
      { type: list_element.name, data: data_array }
    end

    def smart_annotation_check(elem)
      return "[#{elem.text}]" if elem.parent.attributes['class']&.value == 'sa-type'

      elem.text
    end

    # Prepare style for text
    def paragraph_styling(elem)
      style = elem.attributes['style']
      result = {}
      result[:style] = elem.name if elem.name.include? 'h'
      result[:bold] = true if elem.name == 'strong'
      result[:italic] = true if elem.name == 'em'
      style_keys = %w(text-align color text-decoration)

      if style
        style_keys.each do |key|
          style_el = style.value.split(';').select { |i| (i.include? key) }[0]
          next unless style_el

          value = style_el.split(':')[1].strip if style_el

          if key == 'text-align'
            result[:align] = value.to_sym
          elsif key == 'color' && Reports::Utils.calculate_color_hsp("##{normalized_hex_color(value)}") < 190
            result[:color] = normalized_hex_color(value)
          elsif key == 'text-decoration' && value == 'underline'
            result[:underline] = true
          end
        end
      end
      result
    end

    # Prepare style for images
    def image_styling(elem, dimension)
      dimension[0] = elem.attributes['width'].value.to_i if elem.attributes['width']
      dimension[1] = elem.attributes['height'].value.to_i if elem.attributes['height']

      if elem.attributes['style']
        align = if elem.attributes['style'].value.include? 'margin-right'
                  :center
                elsif elem.attributes['style'].value.include? 'float: right'
                  :right
                else
                  :left
                end
      end

      margins = Constants::REPORT_DOCX_MARGIN_LEFT + Constants::REPORT_DOCX_MARGIN_RIGHT
      max_width = (Constants::REPORT_DOCX_WIDTH - margins) / 20

      if dimension[0] > max_width
        x = max_width
        y = dimension[1] * max_width / dimension[0]
      else
        x = dimension[0]
        y = dimension[1]
      end

      {
        width: x,
        height: y,
        align: align,
        max_width: max_width
      }
    end

    def tiny_mce_table_element(table_element)
      # array of elements
      rows = table_element.css('tbody').first.children.map do |row|
        next unless row.name == 'tr'

        cells = row.children.map do |cell|
          next unless cell.name == 'td'

          # Parse cell content
          formated_cell = recursive_children(cell.children, [], true)

          # Combine text elements to single paragraph
          formated_cell = combine_docx_elements(formated_cell)
          formated_cell
        end.reject(&:blank?)
        { type: 'tr', data: cells }
      end.reject(&:blank?)
      { type: 'table', data: rows }
    end

    def text_formatting_element(element)
      { type: element.name, value: element.text }
    end

    def normalized_hex_color(color)
      return color.delete('#') if color.start_with?('#')

      color.scan(/\d+/).map(&:to_i).map { |c| c.to_s(16).rjust(2, '0').upcase }.join
    end
  end
end