scinote-web/app/services/reports/html_to_word_converter.rb

281 lines
8.6 KiB
Ruby
Raw Normal View History

2020-10-13 19:54:16 +08:00
# frozen_string_literal: true
module Reports
class HtmlToWordConverter
# Stores the target document and the rendering options used by
# html_to_word_converter.
#
# @param document the document object that elements are drawn on (kept as @docx)
# @param options [Hash, nil] optional settings; only :scinote_url and
#   :link_style are read
def initialize(document, options = {})
  options ||= {} # tolerate callers that pass an explicit nil
  @docx = document
  @scinote_url = options[:scinote_url]
  @link_style = options[:link_style]
end
# Parses an HTML fragment and draws its content onto @docx.
#
# The children of <body> are flattened by recursive_children, inline pieces
# are grouped into paragraphs by combine_docx_elements, and each resulting
# element is dispatched on its :type. Elements with any other :type are
# ignored.
#
# @param text [String] HTML markup to convert
# @return [Array<Hash>] the combined elements that were iterated
def html_to_word_converter(text)
  html = Nokogiri::HTML(text)
  raw_elements = recursive_children(html.css('body').children, []).compact

  # Combine raw text blocks into paragraphs
  elements = combine_docx_elements(raw_elements)

  # Draw elements
  elements.each do |elem|
    case elem[:type]
    when 'p'
      Reports::DocxRenderer.render_p_element(@docx, elem, scinote_url: @scinote_url, link_style: @link_style)
    when 'table'
      Reports::DocxRenderer.render_table_element(@docx, elem)
    when 'newline'
      style = elem[:style] || {}
      # Print a heading if it is one.
      # Mixing heading with other style setting causes problems for Word
      if %w(h1 h2 h3 h4 h5 h6).include?(style[:style])
        @docx.public_send(style[:style], elem[:value])
      else
        @docx.p elem[:value] do
          align style[:align]
          color style[:color]
          bold style[:bold]
          italic style[:italic]
        end
      end
    when 'image'
      Reports::DocxRenderer.render_img_element(@docx, elem)
    when 'ul', 'ol'
      Reports::DocxRenderer.render_list_element(@docx, elem)
    end
  end
end
private
# Groups consecutive inline elements into paragraph elements.
#
# Word does not support some nested elements, so block-level elements
# (image, newline, table, ol, ul) are moved to root level: each one closes
# the paragraph collected so far and is then emitted on its own. Inline
# elements (br, text, a, sup, sub) are collected into the current paragraph.
# nil entries and elements of any other type are skipped.
#
# A final paragraph is always appended, even when it has no children, so
# the result is never empty.
#
# @param raw_elements [Array<Hash, nil>] flat element list
# @return [Array<Hash>] block elements interleaved with
#   { type: 'p', children: [...] } paragraphs
def combine_docx_elements(raw_elements)
  block_types = %w(image newline table ol ul)
  inline_types = %w(br text a sup sub)
  elements = []
  pending = []

  raw_elements.each do |elem|
    next unless elem

    if block_types.include?(elem[:type])
      # Flush collected inline content before the block element
      unless pending.empty?
        elements.push(type: 'p', children: pending)
        pending = []
      end
      elements.push(elem)
    elsif inline_types.include?(elem[:type])
      pending.push(elem)
    end
  end

  elements.push(type: 'p', children: pending)
  elements
end
# Convert HTML structure to plain text structure
#
# Walks the given nodes depth-first and appends one element hash per
# recognised node to `elements`:
# - text nodes become 'text' elements, or 'newline' elements when the parent
#   has a heading style or a non-justify alignment and skip_newline is false;
# - br, img, a, table, sup/sub and ul/ol nodes are converted by their helpers;
# - any other node is descended into.
# img_element may return nil, so the result can contain nil entries.
#
# @param children enumerable of nodes to convert
# @param elements [Array] accumulator the converted elements are pushed onto
# @param skip_newline [Boolean] when true, text is never emitted as 'newline'
# @return [Array] the accumulator
# rubocop:disable Metrics/BlockLength
def recursive_children(children, elements, skip_newline = false)
  # NOTE(review): this literal is meant to hold an invisible character, not a
  # plain space (a plain space would make the skip below unreachable and would
  # strip every space from the text) - confirm the code point in the repository.
  invisible_symbol = ' '

  children.each do |elem|
    if elem.instance_of?(Nokogiri::XML::Text)
      next if elem.text.strip == invisible_symbol

      style = paragraph_styling(elem.parent)
      breaks_line = (style[:align] && style[:align] != :justify) || style[:style]
      type = (!skip_newline && breaks_line) ? 'newline' : 'text'
      text = smart_annotation_check(elem)
      elements.push(type: type, value: text.strip.delete(invisible_symbol), style: style)
      next
    end

    case elem.name
    when 'br' then elements.push(type: 'br')
    when 'img' then elements.push(img_element(elem))
    when 'a' then elements.push(link_element(elem))
    when 'table' then elements.push(tiny_mce_table_element(elem))
    when 'sup', 'sub' then elements.push(text_formatting_element(elem))
    when 'ul', 'ol' then elements.push(list_element(elem))
    else
      elements = recursive_children(elem.children, elements, skip_newline) if elem.children
    end
  end
  elements
end
# rubocop:enable Metrics/BlockLength
# Builds an 'image' element for an <img> node that references a TinyMceAsset.
#
# The asset id is read from the Base62-encoded `data-mce-token` attribute.
# Returns nil when the attribute is missing, when no asset is found, or when
# FastImage.size yields no dimensions for the prepared image URL.
#
# @param elem the <img> node
# @return [Hash, nil] { type: 'image', data:, blob:, style: } or nil
def img_element(elem)
  token = elem.attributes['data-mce-token']
  return unless token

  image = TinyMceAsset.find_by(id: Base62.decode(token.value))
  return unless image

  image_path = Reports::Utils.image_prepare(image).processed.url
  dimension = FastImage.size(image_path)
  return unless dimension

  style = image_styling(elem, dimension)
  # Only the part of the URL before the first '&' is kept as :data
  { type: 'image', data: image_path.split('&').first, blob: image.blob, style: style }
end
# Builds an 'a' element from an anchor node.
#
# The label is the node text, prefixed with '#' for `record-info-link`
# anchors (whose link is dropped), with '#' when the parent's class is
# `atwho-inserted`, and with '@' for `atwho-user-popover` anchors. The
# prefixes stack in that order.
#
# @param elem the <a> node
# @return [Hash] { type: 'a', value: label, link: href or nil }
def link_element(elem)
  css_class = elem.attributes['class']&.value
  parent_class = elem.parent.attributes['class']&.value
  record_link = css_class == 'record-info-link'

  label = elem.text
  label = "##{label}" if record_link
  label = "##{label}" if parent_class == 'atwho-inserted'
  label = "@#{label}" if css_class == 'atwho-user-popover'

  href = record_link ? nil : elem.attributes['href']&.value
  { type: 'a', value: label, link: href }
end
# Converts a <ul>/<ol> node into a nested list element.
#
# Each allowed child of the list yields one array holding the converted
# children of that child: plain strings for text and text-like tags, nested
# list hashes for ul/ol, link hashes for a, and image/table hashes (tagged
# with a random :bookmark_id) for img/table. Blank entries are removed.
#
# The parameter shares its name with this method; `list_element(item)` with
# an argument list is still parsed as the recursive method call.
#
# @param list_element the <ul> or <ol> node
# @return [Hash] { type: 'ul' | 'ol', data: Array<Array> }
def list_element(list_element)
  # Tags whose text content is kept as a plain string (h1..h6 all included)
  text_tags = %w(strong em h1 h2 h3 h4 h5 h6 span p)
  allowed_elements = %w(li ul ol a img) + text_tags

  list_children = list_element.children.select { |n| allowed_elements.include?(n.name) }
  data_array = list_children.map do |li_child|
    li_child.children.map do |item|
      if item.is_a?(Nokogiri::XML::Text)
        item.text.chomp
      else
        case item.name
        when 'ul', 'ol' then list_element(item)
        when 'a' then link_element(item)
        when 'img' then img_element(item)&.merge(bookmark_id: SecureRandom.hex)
        when 'table' then tiny_mce_table_element(item).merge(bookmark_id: SecureRandom.hex)
        when *text_tags
          # Pass styles and extend renderer for li with style, some limitations on li items
          # { type: 'text', value: item[:value], style: paragraph_styling(item) }
          item.children.text
        end
      end
    end.reject(&:blank?)
  end

  { type: list_element.name, data: data_array }
end
# Returns the node text, wrapped in square brackets when the parent node's
# class attribute is exactly 'sa-type'.
#
# @param elem the text node
# @return [String]
def smart_annotation_check(elem)
  parent_class = elem.parent.attributes['class']&.value
  parent_class == 'sa-type' ? "[#{elem.text}]" : elem.text
end
# Prepare style for text
#
# Derives a style hash from a node's tag name and inline `style` attribute:
# - :style     the tag name, for h1..h6 only
# - :bold      true for <strong>
# - :italic    true for <em>
# - :align     the `text-align` value as a symbol
# - :color     the normalized hex `color`, kept only when
#              Reports::Utils.calculate_color_hsp returns a value below 190
# - :underline true when `text-decoration` is exactly 'underline'
#
# Declarations are matched on their exact property name, so for example
# `background-color` is not mistaken for `color`. Declarations without a
# name or a value are ignored; when a property is repeated the first
# occurrence wins.
#
# @param elem the node whose styling is read
# @return [Hash] the keys listed above, present only when they apply
def paragraph_styling(elem)
  result = {}
  result[:style] = elem.name if elem.name.match?(/\Ah[1-6]\z/)
  result[:bold] = true if elem.name == 'strong'
  result[:italic] = true if elem.name == 'em'

  style = elem.attributes['style']
  return result unless style

  declarations = style.value.split(';').each_with_object({}) do |declaration, parsed|
    property, value = declaration.split(':', 2).map(&:strip)
    next if property.nil? || property.empty? || value.nil? || value.empty?

    property = property.downcase
    parsed[property] = value unless parsed.key?(property)
  end

  align = declarations['text-align']
  result[:align] = align.to_sym if align

  color = declarations['color']
  if color
    hex = normalized_hex_color(color)
    result[:color] = hex if Reports::Utils.calculate_color_hsp("##{hex}") < 190
  end

  result[:underline] = true if declarations['text-decoration'] == 'underline'
  result
end
# Prepare style for images
def image_styling(elem, dimension)
dimension[0] = elem.attributes['width'].value.to_i if elem.attributes['width']
dimension[1] = elem.attributes['height'].value.to_i if elem.attributes['height']
if elem.attributes['style']
align = if elem.attributes['style'].value.include? 'margin-right'
:center
elsif elem.attributes['style'].value.include? 'float: right'
:right
else
:left
end
end
margins = Constants::REPORT_DOCX_MARGIN_LEFT + Constants::REPORT_DOCX_MARGIN_RIGHT
max_width = (Constants::REPORT_DOCX_WIDTH - margins) / 20
if dimension[0] > max_width
x = max_width
y = dimension[1] * max_width / dimension[0]
else
x = dimension[0]
y = dimension[1]
end
{
width: x,
height: y,
align: align,
max_width: max_width
}
end
# Converts a <table> node into a 'table' element.
#
# Every <tr> becomes { type: 'tr', data: cells }, where each <td> is
# converted with recursive_children (skip_newline = true, so cell text is
# never emitted as a 'newline' element) and then grouped into paragraphs by
# combine_docx_elements. Nodes other than <tr> / <td> are ignored.
#
# @param table_element the <table> node
# @return [Hash] { type: 'table', data: Array<Hash> }
def tiny_mce_table_element(table_element)
  # Rows are read from the first <tbody>; fall back to the table node itself
  # so markup without a <tbody> does not raise on nil.
  body = table_element.css('tbody').first || table_element

  rows = body.children.select { |row| row.name == 'tr' }.map do |row|
    cells = row.children.select { |cell| cell.name == 'td' }.map do |cell|
      # Parse cell content, then combine text elements into paragraphs
      combine_docx_elements(recursive_children(cell.children, [], true))
    end
    { type: 'tr', data: cells }
  end

  { type: 'table', data: rows }
end
# Wraps an inline formatting node as an element whose type is the tag name
# and whose value is the node text.
#
# @param element the node (called for <sup> / <sub> by recursive_children)
# @return [Hash] { type: tag name, value: text }
def text_formatting_element(element)
  tag = element.name
  content = element.text
  { type: tag, value: content }
end
# Converts a CSS colour value into hex digits without the leading '#'.
#
# - '#rrggbb' is returned with the '#' removed; '#rgb' shorthand is expanded
#   to six digits (case is preserved).
# - Otherwise the first three numeric components are read (so the alpha
#   channel of rgba() is ignored), rounded, clamped to 0..255 and rendered
#   as upper-case two-digit hex pairs.
# Values with no numeric components (e.g. colour names) yield an empty string.
#
# @param color [String] CSS colour value
# @return [String] hex digits
def normalized_hex_color(color)
  if color.start_with?('#')
    hex = color.delete('#')
    return hex.length == 3 ? hex.chars.map { |c| c * 2 }.join : hex
  end

  components = color.scan(/\d+(?:\.\d+)?/).first(3)
  components.map { |c| format('%02X', c.to_f.round.clamp(0, 255)) }.join
end
2020-10-13 19:54:16 +08:00
end
end