Switch to asset text extraction with ActiveStorage analyzer [SCI-9954] (#6956)

This commit is contained in:
Alex Kriuchykhin 2024-02-05 16:24:06 +01:00 committed by GitHub
parent 7155d7862a
commit 342e9dbb19
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
16 changed files with 91 additions and 87 deletions

View file

@ -34,7 +34,7 @@ module Api
end
asset.save!(context: :on_api_upload)
asset.post_process_file(@team)
asset.post_process_file
render jsonapi: asset,
serializer: AssetSerializer,

View file

@ -113,7 +113,7 @@ module Api
blob = create_blob_from_params
asset = Asset.create!(file: blob, team: @team)
end
asset.post_process_file(@team)
asset.post_process_file
ResultAsset.create!(asset: asset, result: @result)
end
end
@ -129,7 +129,7 @@ module Api
blob = create_blob_from_params
asset.update!(file: blob)
end
asset.post_process_file(@team)
asset.post_process_file
new_checksum = asset.file.blob.checksum
end
@asset_result_updated = old_checksum != new_checksum

View file

@ -193,7 +193,7 @@ class AssetsController < ApplicationController
# release previous image space
@asset.team.release_space(orig_file_size)
# Post process file here
@asset.post_process_file(@asset.team)
@asset.post_process_file
@asset.step&.protocol&.update(updated_at: Time.zone.now)
render_html = if [Result, Step].include?(@assoc.class)

View file

@ -60,7 +60,7 @@ class ResultAssetsController < ApplicationController
team.save
# Post process new file if necessary
@result.asset.post_process_file(team) if asset_changed && @result.asset.present?
@result.asset.post_process_file if asset_changed && @result.asset.present?
log_activity(:edit_result)
end
@ -133,7 +133,7 @@ class ResultAssetsController < ApplicationController
last_modified_by: current_user)
results << result
# Post process file here
asset.post_process_file(@my_module.experiment.project.team)
asset.post_process_file
log_activity(:add_result, result)
end

View file

@ -89,7 +89,7 @@ class ResultsController < ApplicationController
view_mode: @result.assets_view_mode
)
@asset.file.attach(params[:signed_blob_id])
@asset.post_process_file(@my_module.team)
@asset.post_process_file
end
log_activity(:result_file_added, { file: @asset.file_name, result: @result })

View file

@ -42,7 +42,7 @@ class StepsController < ApplicationController
view_mode: @step.assets_view_mode
)
@asset.file.attach(params[:signed_blob_id])
@asset.post_process_file(@protocol.team)
@asset.post_process_file
default_message_items = {
step: @step.id,

View file

@ -1,20 +0,0 @@
# Job object carrying the id of an asset whose file text should be extracted,
# plus the in_template flag that is forwarded to Asset#extract_asset_text.
AssetTextExtractionJob = Struct.new(:asset_id, :in_template) do
  # Queue name reported by this job.
  def queue_name
    'assets'
  end

  # Maximum number of attempts reported by this job.
  def max_attempts
    1
  end

  # Maximum run time reported by this job.
  def max_run_time
    5.minutes
  end

  # Looks the asset up by id and triggers text extraction on it.
  # Does nothing when the asset no longer exists or has no file attached.
  def perform
    target = Asset.find_by(id: asset_id)
    return if target.blank? || !target.file.attached?

    target.extract_asset_text(in_template)
  end
end

View file

@ -120,7 +120,7 @@ module Protocols
asset.file.attach(io: StringIO.new(Base64.decode64(step_element_json['contents'])), filename: 'file.blob')
asset.save!
step.step_assets.create!(asset: asset)
asset.post_process_file(@protocol.team)
asset.post_process_file
end
def create_step_orderable_element!(step, orderable)

View file

@ -50,7 +50,7 @@ class Asset < ApplicationRecord
joins(file_attachment: :blob).order(sort)
}
attr_accessor :file_content, :file_info, :in_template
attr_accessor :file_content, :file_info
before_save :reset_file_processing, if: -> { file.new_record? }
@ -238,7 +238,7 @@ class Asset < ApplicationRecord
end
end
to_asset.post_process_file(to_asset.team)
to_asset.post_process_file
end
def image?
@ -273,19 +273,9 @@ class Asset < ApplicationRecord
pdf? || (previewable_document?(blob) && Rails.application.config.x.enable_pdf_previews)
end
def post_process_file(team = nil)
# Extract asset text if it's of correct type
if text?
Rails.logger.info "Asset #{id}: Creating extract text job"
# The extract_asset_text also includes
# estimated size calculation
Delayed::Job.enqueue(AssetTextExtractionJob.new(id, in_template))
elsif marvinjs?
extract_asset_text
else
# Update asset's estimated size immediately
update_estimated_size(team)
end
def post_process_file
# Update asset's estimated size immediately
update_estimated_size unless text? || marvinjs?
if Rails.application.config.x.enable_pdf_previews && previewable_document?(blob)
PdfPreviewJob.perform_later(id)
@ -293,43 +283,10 @@ class Asset < ApplicationRecord
end
end
# Extracts text from the attached file and stores it in the asset's text datum,
# then refreshes the estimated size. MarvinJS assets are read from the
# description kept in the file metadata; every other file is opened and parsed
# with Yomu. Any StandardError raised along the way is logged and swallowed.
#
# in_template - value stored on the asset's in_template accessor before
#               extraction starts (default: false).
def extract_asset_text(in_template = false)
  self.in_template = in_template

  extracted = nil
  if marvinjs?
    sketch_xml = Nokogiri::XML(file.metadata[:description])
    sketch_xml.remove_namespaces!
    text_nodes = sketch_xml.search("//Field[@name='text']")
    extracted = text_nodes.map(&:text).join(' ')
  else
    blob.open { |tmp_file| extracted = Yomu.new(tmp_file.path).text }
  end

  # Reuse the existing text datum when there is one, otherwise create it.
  if asset_text_datum.present?
    asset_text_datum.update(data: extracted)
  else
    AssetTextDatum.create(data: extracted, asset: self)
  end

  Rails.logger.info "Asset #{id}: Asset file successfully extracted"

  # The estimated size is recalculated last so it can account for the
  # freshly stored text data.
  update_estimated_size(team)
rescue StandardError => e
  Rails.logger.fatal(
    "Asset #{id}: Error extracting contents from asset file #{file.blob.key}: #{e.message}"
  )
end
# If team is provided, its space_taken
# is updated as well
def update_estimated_size(team = nil)
return if file_size.blank? || in_template
def update_estimated_size
return if file_size.blank?
es = file_size
if asset_text_datum.present? && asset_text_datum.persisted?

View file

@ -70,7 +70,7 @@ class RepositoryAssetValue < ApplicationRecord
asset.last_modified_by = user
self.last_modified_by = user
asset.save! && save!
asset.post_process_file(repository_cell.repository_column.repository.team)
asset.post_process_file
end
def snapshot!(cell_snapshot)
@ -104,7 +104,7 @@ class RepositoryAssetValue < ApplicationRecord
value.asset.file.attach(io: StringIO.new(Base64.decode64(payload[:file_data])), filename: payload[:file_name])
end
value.asset.post_process_file(team)
value.asset.post_process_file
value
end

View file

@ -24,7 +24,7 @@ class MarvinJsService
team_id: current_team.id)
attach_file(asset.file, file, params)
asset.save!
asset.post_process_file(current_team)
asset.post_process_file
connect_asset(asset, params, current_user)
end
@ -41,7 +41,7 @@ class MarvinJsService
file = generate_image(params)
attach_file(attachment, file, params)
asset.update(last_modified_by: current_user) if asset.is_a?(Asset)
asset.post_process_file(current_team) if asset.class == Asset
asset.post_process_file if asset.instance_of?(Asset)
asset
end

View file

@ -960,10 +960,9 @@ class TeamImporter
asset.last_modified_by_id =
user_id || find_user(asset.last_modified_by_id)
asset.team = team
asset.in_template = true if @is_template
asset.save!
asset.file.attach(io: file, filename: File.basename(file))
asset.post_process_file(team)
asset.post_process_file
@asset_mappings[orig_asset_id] = asset.id
@asset_counter += 1
asset

View file

@ -148,7 +148,7 @@ class ProtocolsImporter
# Post process assets
asset_ids.each do |asset_id|
Asset.find(asset_id).post_process_file(protocol.team)
Asset.find(asset_id).post_process_file
end
end

View file

@ -95,7 +95,7 @@ class ProtocolsImporterV2
# Post process assets
asset_ids.each do |asset_id|
Asset.find(asset_id).post_process_file(protocol.team)
Asset.find(asset_id).post_process_file
end
end

View file

@ -2,6 +2,7 @@
require 'active_storage/previewer/libreoffice_previewer'
require 'active_storage/analyzer/image_analyzer/custom_image_magick'
require 'active_storage/analyzer/text_extraction_analyzer'
require 'active_storage/downloader'
# Enable PDF previews for files
@ -11,6 +12,7 @@ Rails.application.config.active_storage.previewers = [ActiveStorage::Previewer::
ActiveStorage::Previewer::LibreofficePreviewer]
Rails.application.config.active_storage.analyzers.prepend(ActiveStorage::Analyzer::ImageAnalyzer::CustomImageMagick)
Rails.application.config.active_storage.analyzers.append(ActiveStorage::Analyzer::TextExtractionAnalyzer)
Rails.application.config.active_storage.variable_content_types << 'image/svg+xml'

View file

@ -0,0 +1,66 @@
# frozen_string_literal: true
module ActiveStorage
  # Active Storage analyzer that extracts plain text from a blob and stores it
  # in the AssetTextDatum of every Asset the blob is attached to.
  class Analyzer::TextExtractionAnalyzer < Analyzer
    # Accepts blobs whose content type is listed in
    # Constants::TEXT_EXTRACT_FILE_TYPES and that are attached to at least one
    # Asset record.
    def self.accept?(blob)
      blob.content_type.in?(Constants::TEXT_EXTRACT_FILE_TYPES) && blob.attachments.where(record_type: 'Asset').any?
    end

    # Signals that analysis should be deferred rather than run inline
    # (see the Active Storage analyzer documentation for the exact contract).
    def self.analyze_later?
      true
    end

    # Downloads the blob, extracts its text with the strategy matching the
    # blob type and persists the result on the attached assets.
    #
    # Returns the metadata hash produced by #create_or_update_text_data.
    def metadata
      download_blob_to_tempfile do |file|
        if blob.content_type == 'application/pdf'
          process_pdf(file)
        elsif blob.metadata[:asset_type] == 'marvinjs'
          process_marvinjs
        else
          process_other(file)
        end
      end
    end

    private

    # Extracts PDF text with the `pdftotext` command line tool, falling back
    # to the generic extractor when the binary is not installed.
    def process_pdf(file)
      # Block form closes the pipe and waits for the child process, so no
      # file descriptor or zombie process is left behind.
      text_data = IO.popen(['pdftotext', file.path, '-'], 'r', &:read)
      create_or_update_text_data(text_data)
    rescue Errno::ENOENT
      logger.info "pdftotext isn't installed, falling back to default text extraction method"
      process_other(file)
    end

    # Extracts the text fields from a MarvinJS sketch. The sketch XML is kept
    # in the blob's metadata under :description, not in the downloaded
    # tempfile, so it has to be read from the blob.
    def process_marvinjs
      mjs_doc = Nokogiri::XML(blob.metadata[:description])
      mjs_doc.remove_namespaces!
      text_data = mjs_doc.search("//Field[@name='text']").map(&:text).join(' ')
      create_or_update_text_data(text_data)
    end

    # Generic extraction through Yomu for every other accepted file type.
    def process_other(file)
      text_data = Yomu.new(file.path).text
      create_or_update_text_data(text_data)
    end

    # Stores the extracted text on every Asset this blob is attached to and
    # refreshes each asset's estimated size.
    #
    # Returns the hash reported back as analyzer metadata.
    def create_or_update_text_data(text_data)
      blob.attachments.where(record_type: 'Asset').each do |attachment|
        asset = attachment.record
        if asset.asset_text_datum.present?
          # Update existing text datum if it exists
          asset.asset_text_datum.update!(data: text_data)
        else
          # Create new text datum
          asset.create_asset_text_datum!(data: text_data)
        end
        asset.update_estimated_size
        Rails.logger.info "Asset #{asset.id}: file text successfully extracted"
      end

      { text_extracted: true }
    end
  end
end