scinote-web/app/services/reports/html_to_word_converter.rb

279 lines
8.6 KiB
Ruby
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# frozen_string_literal: true
module Reports
class HtmlToWordConverter
def initialize(document, options = {})
@docx = document
@scinote_url = options[:scinote_url]
@link_style = options[:link_style]
end
def html_to_word_converter(text)
html = Nokogiri::HTML(text)
raw_elements = recursive_children(html.css('body').children, []).compact
# Combined raw text blocks in paragraphs
elements = combine_docx_elements(raw_elements)
# Draw elements
elements.each do |elem|
if elem[:type] == 'p'
Reports::DocxRenderer.render_p_element(@docx, elem, scinote_url: @scinote_url, link_style: @link_style)
elsif elem[:type] == 'table'
Reports::DocxRenderer.render_table_element(@docx, elem)
elsif elem[:type] == 'newline'
style = elem[:style] || {}
# print heading if its heading
# Mixing heading with other style setting causes problems for Word
if %w(h1 h2 h3 h4 h5 h6).include?(style[:style])
@docx.public_send(style[:style], elem[:value])
else
@docx.p elem[:value] do
align style[:align]
color style[:color]
bold style[:bold]
italic style[:italic]
end
end
elsif elem[:type] == 'image'
Reports::DocxRenderer.render_img_element(@docx, elem)
elsif %w(ul ol).include?(elem[:type])
Reports::DocxRenderer.render_list_element(@docx, elem)
end
end
end
private
def combine_docx_elements(raw_elements)
# Word does not support some nested elements, move some elements to root level
elements = []
temp_p = []
raw_elements.each do |elem|
next unless elem
if %w(image newline table ol ul).include? elem[:type]
unless temp_p.blank?
elements.push(type: 'p', children: temp_p)
temp_p = []
end
elements.push(elem)
elsif %w(br text a sup sub).include? elem[:type]
temp_p.push(elem)
end
end
elements.push(type: 'p', children: temp_p)
elements
end
# Convert HTML structure to plain text structure
# rubocop:disable Metrics/BlockLength
def recursive_children(children, elements, skip_newline = false)
children.each do |elem|
if elem.class == Nokogiri::XML::Text
next if elem.text.strip == ' ' # Invisible symbol
style = paragraph_styling(elem.parent)
type = !skip_newline && ((style[:align] && style[:align] != :justify) || style[:style]) ? 'newline' : 'text'
text = smart_annotation_check(elem)
elements.push(
type: type,
value: text.strip.delete(' '), # Invisible symbol
style: style
)
next
end
if elem.name == 'br'
elements.push(type: 'br')
next
end
if elem.name == 'img'
elements.push(img_element(elem))
next
end
if elem.name == 'a'
elements.push(link_element(elem))
next
end
if elem.name == 'table'
elements.push(tiny_mce_table_element(elem))
next
end
if %w(sup sub).include?(elem.name)
elements.push(text_formatting_element(elem))
next
end
if %w(ul ol).include?(elem.name)
elements.push(list_element(elem))
next
end
elements = recursive_children(elem.children, elements, skip_newline) if elem.children
end
elements
end
# rubocop:enable Metrics/BlockLength
def img_element(elem)
return unless elem.attributes['data-mce-token']
image = TinyMceAsset.find_by(id: Base62.decode(elem.attributes['data-mce-token'].value))
return unless image
image_path = Reports::Utils.image_prepare(image).service_url
dimension = FastImage.size(image_path)
return unless dimension
style = image_styling(elem, dimension)
{ type: 'image', data: image_path.split('&')[0], blob: image.blob, style: style }
end
def link_element(elem)
text = elem.text
link = elem.attributes['href'].value if elem.attributes['href']
if elem.attributes['class']&.value == 'record-info-link'
link = nil
text = "##{text}"
end
text = "##{text}" if elem.parent.attributes['class']&.value == 'atwho-inserted'
text = "@#{text}" if elem.attributes['class']&.value == 'atwho-user-popover'
{
type: 'a',
value: text,
link: link
}
end
def list_element(list_element)
allowed_elements = %w(li ul ol a img strong em h1 h2 h2 h3 h4 h5 span p)
data_array = list_element.children.select { |n| allowed_elements.include?(n.name) }.map do |li_child|
li_child.children.map do |item|
if item.is_a? Nokogiri::XML::Text
item.text.chomp
elsif %w(ul ol).include?(item.name)
list_element(item)
elsif %w(a).include?(item.name)
link_element(item)
elsif %w(img).include?(item.name)
img_element(item)&.merge(bookmark_id: SecureRandom.hex)
elsif %w(table).include?(item.name)
tiny_mce_table_element(item).merge(bookmark_id: SecureRandom.hex)
elsif %w(strong em h1 h2 h2 h3 h4 h5 span p).include?(item.name)
# Pass styles and extend renderer for li with style, some limitations on li items
# { type: 'text', value: item[:value], style: paragraph_styling(item) }
item.children.text
end
end.reject(&:blank?)
end
{ type: list_element.name, data: data_array }
end
def smart_annotation_check(elem)
return "[#{elem.text}]" if elem.parent.attributes['class']&.value == 'sa-type'
elem.text
end
# Prepare style for text
def paragraph_styling(elem)
style = elem.attributes['style']
result = {}
result[:style] = elem.name if elem.name.include? 'h'
result[:bold] = true if elem.name == 'strong'
result[:italic] = true if elem.name == 'em'
style_keys = %w(text-align color text-decoration)
if style
style_keys.each do |key|
style_el = style.value.split(';').select { |i| (i.include? key) }[0]
next unless style_el
value = style_el.split(':')[1].strip if style_el
if key == 'text-align'
result[:align] = value.to_sym
elsif key == 'color' && Reports::Utils.calculate_color_hsp("##{normalized_hex_color(value)}") < 190
result[:color] = normalized_hex_color(value)
elsif key == 'text-decoration' && value == 'underline'
result[:underline] = true
end
end
end
result
end
# Prepare style for images
def image_styling(elem, dimension)
dimension[0] = elem.attributes['width'].value.to_i if elem.attributes['width']
dimension[1] = elem.attributes['height'].value.to_i if elem.attributes['height']
if elem.attributes['style']
align = if elem.attributes['style'].value.include? 'margin-right'
:center
elsif elem.attributes['style'].value.include? 'float: right'
:right
else
:left
end
end
margins = Constants::REPORT_DOCX_MARGIN_LEFT + Constants::REPORT_DOCX_MARGIN_RIGHT
max_width = (Constants::REPORT_DOCX_WIDTH - margins) / 20
if dimension[0] > max_width
x = max_width
y = dimension[1] * max_width / dimension[0]
else
x = dimension[0]
y = dimension[1]
end
{
width: x,
height: y,
align: align,
max_width: max_width
}
end
def tiny_mce_table_element(table_element)
# array of elements
rows = table_element.css('tbody').first.children.map do |row|
next unless row.name == 'tr'
cells = row.children.map do |cell|
next unless cell.name == 'td'
# Parse cell content
formated_cell = recursive_children(cell.children, [], true)
# Combine text elements to single paragraph
formated_cell = combine_docx_elements(formated_cell)
formated_cell
end.reject(&:blank?)
{ type: 'tr', data: cells }
end.reject(&:blank?)
{ type: 'table', data: rows }
end
def text_formatting_element(element)
{ type: element.name, value: element.text }
end
def normalized_hex_color(color)
return color.delete('#') if color.start_with?('#')
color.scan(/\d+/).map(&:to_i).map { |c| c.to_s(16).rjust(2, '0').upcase }.join
end
end
end