diff --git a/app/controllers/api/v1/assets_controller.rb b/app/controllers/api/v1/assets_controller.rb index e758e741c..24219cae7 100644 --- a/app/controllers/api/v1/assets_controller.rb +++ b/app/controllers/api/v1/assets_controller.rb @@ -34,7 +34,7 @@ module Api end asset.save!(context: :on_api_upload) - asset.post_process_file(@team) + asset.post_process_file render jsonapi: asset, serializer: AssetSerializer, diff --git a/app/controllers/api/v1/results_controller.rb b/app/controllers/api/v1/results_controller.rb index 084347588..c26c2ef02 100644 --- a/app/controllers/api/v1/results_controller.rb +++ b/app/controllers/api/v1/results_controller.rb @@ -113,7 +113,7 @@ module Api blob = create_blob_from_params asset = Asset.create!(file: blob, team: @team) end - asset.post_process_file(@team) + asset.post_process_file ResultAsset.create!(asset: asset, result: @result) end end @@ -129,7 +129,7 @@ module Api blob = create_blob_from_params asset.update!(file: blob) end - asset.post_process_file(@team) + asset.post_process_file new_checksum = asset.file.blob.checksum end @asset_result_updated = old_checksum != new_checksum diff --git a/app/controllers/assets_controller.rb b/app/controllers/assets_controller.rb index 03dde54ea..504f376ab 100644 --- a/app/controllers/assets_controller.rb +++ b/app/controllers/assets_controller.rb @@ -193,7 +193,7 @@ class AssetsController < ApplicationController # release previous image space @asset.team.release_space(orig_file_size) # Post process file here - @asset.post_process_file(@asset.team) + @asset.post_process_file @asset.step&.protocol&.update(updated_at: Time.zone.now) render_html = if [Result, Step].include?(@assoc.class) diff --git a/app/controllers/result_assets_controller.rb b/app/controllers/result_assets_controller.rb index 228f12a92..e2c429260 100644 --- a/app/controllers/result_assets_controller.rb +++ b/app/controllers/result_assets_controller.rb @@ -60,7 +60,7 @@ class ResultAssetsController < ApplicationController 
team.save # Post process new file if neccesary - @result.asset.post_process_file(team) if asset_changed && @result.asset.present? + @result.asset.post_process_file if asset_changed && @result.asset.present? log_activity(:edit_result) end @@ -133,7 +133,7 @@ class ResultAssetsController < ApplicationController last_modified_by: current_user) results << result # Post process file here - asset.post_process_file(@my_module.experiment.project.team) + asset.post_process_file log_activity(:add_result, result) end diff --git a/app/controllers/results_controller.rb b/app/controllers/results_controller.rb index 19ca74efa..ca411b03f 100644 --- a/app/controllers/results_controller.rb +++ b/app/controllers/results_controller.rb @@ -89,7 +89,7 @@ class ResultsController < ApplicationController view_mode: @result.assets_view_mode ) @asset.file.attach(params[:signed_blob_id]) - @asset.post_process_file(@my_module.team) + @asset.post_process_file end log_activity(:result_file_added, { file: @asset.file_name, result: @result }) diff --git a/app/controllers/steps_controller.rb b/app/controllers/steps_controller.rb index cd4d699de..c2b1c0764 100644 --- a/app/controllers/steps_controller.rb +++ b/app/controllers/steps_controller.rb @@ -42,7 +42,7 @@ class StepsController < ApplicationController view_mode: @step.assets_view_mode ) @asset.file.attach(params[:signed_blob_id]) - @asset.post_process_file(@protocol.team) + @asset.post_process_file default_message_items = { step: @step.id, diff --git a/app/jobs/asset_text_extraction_job.rb b/app/jobs/asset_text_extraction_job.rb deleted file mode 100644 index 3778e3ade..000000000 --- a/app/jobs/asset_text_extraction_job.rb +++ /dev/null @@ -1,20 +0,0 @@ -AssetTextExtractionJob = Struct.new(:asset_id, :in_template) do - def perform - asset = Asset.find_by(id: asset_id) - return unless asset.present? && asset.file.attached? 
- - asset.extract_asset_text(in_template) - end - - def queue_name - 'assets' - end - - def max_attempts - 1 - end - - def max_run_time - 5.minutes - end -end diff --git a/app/jobs/protocols/docx_import_job.rb b/app/jobs/protocols/docx_import_job.rb index 709d5cb43..49a34bbc5 100644 --- a/app/jobs/protocols/docx_import_job.rb +++ b/app/jobs/protocols/docx_import_job.rb @@ -120,7 +120,7 @@ module Protocols asset.file.attach(io: StringIO.new(Base64.decode64(step_element_json['contents'])), filename: 'file.blob') asset.save! step.step_assets.create!(asset: asset) - asset.post_process_file(@protocol.team) + asset.post_process_file end def create_step_orderable_element!(step, orderable) diff --git a/app/models/asset.rb b/app/models/asset.rb index f9d6c72c2..11041099b 100644 --- a/app/models/asset.rb +++ b/app/models/asset.rb @@ -50,7 +50,7 @@ class Asset < ApplicationRecord joins(file_attachment: :blob).order(sort) } - attr_accessor :file_content, :file_info, :in_template + attr_accessor :file_content, :file_info before_save :reset_file_processing, if: -> { file.new_record? } @@ -238,7 +238,7 @@ class Asset < ApplicationRecord end end - to_asset.post_process_file(to_asset.team) + to_asset.post_process_file end def image? @@ -273,19 +273,9 @@ class Asset < ApplicationRecord pdf? || (previewable_document?(blob) && Rails.application.config.x.enable_pdf_previews) end - def post_process_file(team = nil) - # Extract asset text if it's of correct type - if text? - Rails.logger.info "Asset #{id}: Creating extract text job" - # The extract_asset_text also includes - # estimated size calculation - Delayed::Job.enqueue(AssetTextExtractionJob.new(id, in_template)) - elsif marvinjs? - extract_asset_text - else - # Update asset's estimated size immediately - update_estimated_size(team) - end + def post_process_file + # Update asset's estimated size immediately + update_estimated_size unless text? || marvinjs? 
if Rails.application.config.x.enable_pdf_previews && previewable_document?(blob) PdfPreviewJob.perform_later(id) @@ -293,43 +283,10 @@ class Asset < ApplicationRecord end end - def extract_asset_text(in_template = false) - self.in_template = in_template - - if marvinjs? - mjs_doc = Nokogiri::XML(file.metadata[:description]) - mjs_doc.remove_namespaces! - text_data = mjs_doc.search("//Field[@name='text']").collect(&:text).join(' ') - else - blob.open do |tmp_file| - text_data = Yomu.new(tmp_file.path).text - end - end - - if asset_text_datum.present? - # Update existing text datum if it exists - asset_text_datum.update(data: text_data) - else - # Create new text datum - AssetTextDatum.create(data: text_data, asset: self) - end - - Rails.logger.info "Asset #{id}: Asset file successfully extracted" - - # Finally, update asset's estimated size to include - # the data vector - update_estimated_size(team) - rescue StandardError => e - Rails.logger.fatal( - "Asset #{id}: Error extracting contents from asset "\ - "file #{file.blob.key}: #{e.message}" - ) - end - # If team is provided, its space_taken # is updated as well - def update_estimated_size(team = nil) - return if file_size.blank? || in_template + def update_estimated_size + return if file_size.blank? es = file_size if asset_text_datum.present? && asset_text_datum.persisted? diff --git a/app/models/repository_asset_value.rb b/app/models/repository_asset_value.rb index f4426c00e..899149f92 100644 --- a/app/models/repository_asset_value.rb +++ b/app/models/repository_asset_value.rb @@ -70,7 +70,7 @@ class RepositoryAssetValue < ApplicationRecord asset.last_modified_by = user self.last_modified_by = user asset.save! && save! 
- asset.post_process_file(repository_cell.repository_column.repository.team) + asset.post_process_file end def snapshot!(cell_snapshot) @@ -104,7 +104,7 @@ class RepositoryAssetValue < ApplicationRecord value.asset.file.attach(io: StringIO.new(Base64.decode64(payload[:file_data])), filename: payload[:file_name]) end - value.asset.post_process_file(team) + value.asset.post_process_file value end diff --git a/app/services/marvin_js_service.rb b/app/services/marvin_js_service.rb index 777936044..25e2ceac6 100644 --- a/app/services/marvin_js_service.rb +++ b/app/services/marvin_js_service.rb @@ -24,7 +24,7 @@ class MarvinJsService team_id: current_team.id) attach_file(asset.file, file, params) asset.save! - asset.post_process_file(current_team) + asset.post_process_file connect_asset(asset, params, current_user) end @@ -41,7 +41,7 @@ class MarvinJsService file = generate_image(params) attach_file(attachment, file, params) asset.update(last_modified_by: current_user) if asset.is_a?(Asset) - asset.post_process_file(current_team) if asset.class == Asset + asset.post_process_file if asset.instance_of?(Asset) asset end diff --git a/app/services/team_importer.rb b/app/services/team_importer.rb index 54c6fc5f3..42a175881 100644 --- a/app/services/team_importer.rb +++ b/app/services/team_importer.rb @@ -960,10 +960,9 @@ class TeamImporter asset.last_modified_by_id = user_id || find_user(asset.last_modified_by_id) asset.team = team - asset.in_template = true if @is_template asset.save! 
asset.file.attach(io: file, filename: File.basename(file)) - asset.post_process_file(team) + asset.post_process_file @asset_mappings[orig_asset_id] = asset.id @asset_counter += 1 asset diff --git a/app/utilities/protocols_importer.rb b/app/utilities/protocols_importer.rb index d1f82f392..a66b8fe02 100644 --- a/app/utilities/protocols_importer.rb +++ b/app/utilities/protocols_importer.rb @@ -148,7 +148,7 @@ class ProtocolsImporter # Post process assets asset_ids.each do |asset_id| - Asset.find(asset_id).post_process_file(protocol.team) + Asset.find(asset_id).post_process_file end end diff --git a/app/utilities/protocols_importer_v2.rb b/app/utilities/protocols_importer_v2.rb index 80f1cbd6e..ba6e3b813 100644 --- a/app/utilities/protocols_importer_v2.rb +++ b/app/utilities/protocols_importer_v2.rb @@ -95,7 +95,7 @@ class ProtocolsImporterV2 # Post process assets asset_ids.each do |asset_id| - Asset.find(asset_id).post_process_file(protocol.team) + Asset.find(asset_id).post_process_file end end diff --git a/config/initializers/active_storage.rb b/config/initializers/active_storage.rb index f854cbb0b..84c260f87 100644 --- a/config/initializers/active_storage.rb +++ b/config/initializers/active_storage.rb @@ -2,6 +2,7 @@ require 'active_storage/previewer/libreoffice_previewer' require 'active_storage/analyzer/image_analyzer/custom_image_magick' +require 'active_storage/analyzer/text_extraction_analyzer' require 'active_storage/downloader' # Enable PDF previews for files @@ -11,6 +12,7 @@ Rails.application.config.active_storage.previewers = [ActiveStorage::Previewer:: ActiveStorage::Previewer::LibreofficePreviewer] Rails.application.config.active_storage.analyzers.prepend(ActiveStorage::Analyzer::ImageAnalyzer::CustomImageMagick) +Rails.application.config.active_storage.analyzers.append(ActiveStorage::Analyzer::TextExtractionAnalyzer) Rails.application.config.active_storage.variable_content_types << 'image/svg+xml' diff --git 
a/lib/active_storage/analyzer/text_extraction_analyzer.rb b/lib/active_storage/analyzer/text_extraction_analyzer.rb
new file mode 100644
index 000000000..df2f9bd59
--- /dev/null
+++ b/lib/active_storage/analyzer/text_extraction_analyzer.rb
@@ -0,0 +1,84 @@
+# frozen_string_literal: true
+
+module ActiveStorage
+  # Analyzer that extracts the text content of blobs attached to Asset
+  # records and stores it in each asset's asset_text_datum.
+  class Analyzer::TextExtractionAnalyzer < Analyzer
+    # Accepts blobs whose content type is listed in
+    # Constants::TEXT_EXTRACT_FILE_TYPES and that are attached to at least
+    # one Asset record.
+    def self.accept?(blob)
+      blob.content_type.in?(Constants::TEXT_EXTRACT_FILE_TYPES) && blob.attachments.where(record_type: 'Asset').any?
+    end
+
+    def self.analyze_later?
+      true
+    end
+
+    # Downloads the blob to a tempfile and runs the extractor matching the
+    # blob: pdftotext for PDFs, the MarvinJS description parser for MarvinJS
+    # assets, Yomu for everything else. Each extractor stores the text on the
+    # attached Assets and returns { text_extracted: true }.
+    def metadata
+      download_blob_to_tempfile do |file|
+        if blob.content_type == 'application/pdf'
+          process_pdf(file)
+        elsif blob.metadata[:asset_type] == 'marvinjs'
+          process_marvinjs(file)
+        else
+          process_other(file)
+        end
+      end
+    end
+
+    private
+
+    # Extracts PDF text with the pdftotext command line tool; falls back to
+    # process_other when the executable cannot be found (Errno::ENOENT).
+    def process_pdf(file)
+      # Block form closes the pipe and waits for the child process.
+      text_data = IO.popen(['pdftotext', file.path, '-'], 'r', &:read)
+      create_or_update_text_data(text_data)
+    rescue Errno::ENOENT
+      logger.info "pdftotext isn't installed, falling back to default text extraction method"
+      process_other(file)
+    end
+
+    # Collects the text fields from the MarvinJS XML description.
+    def process_marvinjs(_file)
+      # The downloaded tempfile has no #metadata; the description is read
+      # from the blob metadata instead.
+      mjs_doc = Nokogiri::XML(blob.metadata[:description])
+      mjs_doc.remove_namespaces!
+      text_data = mjs_doc.search("//Field[@name='text']").collect(&:text).join(' ')
+      create_or_update_text_data(text_data)
+    end
+
+    # Extracts text from any other accepted file type via Yomu.
+    def process_other(file)
+      text_data = Yomu.new(file.path).text
+      create_or_update_text_data(text_data)
+    end
+
+    # Saves text_data on every Asset the blob is attached to and refreshes
+    # that asset's estimated size.
+    def create_or_update_text_data(text_data)
+      blob.attachments.where(record_type: 'Asset').each do |attachment|
+        asset = attachment.record
+        if asset.asset_text_datum.present?
+          # Update existing text datum if it exists
+          asset.asset_text_datum.update!(data: text_data)
+        else
+          # Create new text datum
+          asset.create_asset_text_datum!(data: text_data)
+        end
+
+        asset.update_estimated_size
+
+        Rails.logger.info "Asset #{asset.id}: file text successfully extracted"
+      end
+
+      { text_extracted: true }
+    end
+  end
+end