mirror of
https://github.com/scinote-eln/scinote-web.git
synced 2025-09-06 21:24:23 +08:00
Switch to asset text extraction with ActiveStorage analyzer [SCI-9954] (#6956)
This commit is contained in:
parent
7155d7862a
commit
342e9dbb19
16 changed files with 91 additions and 87 deletions
|
@ -34,7 +34,7 @@ module Api
|
|||
end
|
||||
|
||||
asset.save!(context: :on_api_upload)
|
||||
asset.post_process_file(@team)
|
||||
asset.post_process_file
|
||||
|
||||
render jsonapi: asset,
|
||||
serializer: AssetSerializer,
|
||||
|
|
|
@ -113,7 +113,7 @@ module Api
|
|||
blob = create_blob_from_params
|
||||
asset = Asset.create!(file: blob, team: @team)
|
||||
end
|
||||
asset.post_process_file(@team)
|
||||
asset.post_process_file
|
||||
ResultAsset.create!(asset: asset, result: @result)
|
||||
end
|
||||
end
|
||||
|
@ -129,7 +129,7 @@ module Api
|
|||
blob = create_blob_from_params
|
||||
asset.update!(file: blob)
|
||||
end
|
||||
asset.post_process_file(@team)
|
||||
asset.post_process_file
|
||||
new_checksum = asset.file.blob.checksum
|
||||
end
|
||||
@asset_result_updated = old_checksum != new_checksum
|
||||
|
|
|
@ -193,7 +193,7 @@ class AssetsController < ApplicationController
|
|||
# release previous image space
|
||||
@asset.team.release_space(orig_file_size)
|
||||
# Post process file here
|
||||
@asset.post_process_file(@asset.team)
|
||||
@asset.post_process_file
|
||||
@asset.step&.protocol&.update(updated_at: Time.zone.now)
|
||||
|
||||
render_html = if [Result, Step].include?(@assoc.class)
|
||||
|
|
|
@ -60,7 +60,7 @@ class ResultAssetsController < ApplicationController
|
|||
team.save
|
||||
|
||||
# Post process new file if neccesary
|
||||
@result.asset.post_process_file(team) if asset_changed && @result.asset.present?
|
||||
@result.asset.post_process_file if asset_changed && @result.asset.present?
|
||||
|
||||
log_activity(:edit_result)
|
||||
end
|
||||
|
@ -133,7 +133,7 @@ class ResultAssetsController < ApplicationController
|
|||
last_modified_by: current_user)
|
||||
results << result
|
||||
# Post process file here
|
||||
asset.post_process_file(@my_module.experiment.project.team)
|
||||
asset.post_process_file
|
||||
log_activity(:add_result, result)
|
||||
end
|
||||
|
||||
|
|
|
@ -89,7 +89,7 @@ class ResultsController < ApplicationController
|
|||
view_mode: @result.assets_view_mode
|
||||
)
|
||||
@asset.file.attach(params[:signed_blob_id])
|
||||
@asset.post_process_file(@my_module.team)
|
||||
@asset.post_process_file
|
||||
end
|
||||
|
||||
log_activity(:result_file_added, { file: @asset.file_name, result: @result })
|
||||
|
|
|
@ -42,7 +42,7 @@ class StepsController < ApplicationController
|
|||
view_mode: @step.assets_view_mode
|
||||
)
|
||||
@asset.file.attach(params[:signed_blob_id])
|
||||
@asset.post_process_file(@protocol.team)
|
||||
@asset.post_process_file
|
||||
|
||||
default_message_items = {
|
||||
step: @step.id,
|
||||
|
|
|
@ -1,20 +0,0 @@
|
|||
AssetTextExtractionJob = Struct.new(:asset_id, :in_template) do
|
||||
def perform
|
||||
asset = Asset.find_by(id: asset_id)
|
||||
return unless asset.present? && asset.file.attached?
|
||||
|
||||
asset.extract_asset_text(in_template)
|
||||
end
|
||||
|
||||
def queue_name
|
||||
'assets'
|
||||
end
|
||||
|
||||
def max_attempts
|
||||
1
|
||||
end
|
||||
|
||||
def max_run_time
|
||||
5.minutes
|
||||
end
|
||||
end
|
|
@ -120,7 +120,7 @@ module Protocols
|
|||
asset.file.attach(io: StringIO.new(Base64.decode64(step_element_json['contents'])), filename: 'file.blob')
|
||||
asset.save!
|
||||
step.step_assets.create!(asset: asset)
|
||||
asset.post_process_file(@protocol.team)
|
||||
asset.post_process_file
|
||||
end
|
||||
|
||||
def create_step_orderable_element!(step, orderable)
|
||||
|
|
|
@ -50,7 +50,7 @@ class Asset < ApplicationRecord
|
|||
joins(file_attachment: :blob).order(sort)
|
||||
}
|
||||
|
||||
attr_accessor :file_content, :file_info, :in_template
|
||||
attr_accessor :file_content, :file_info
|
||||
|
||||
before_save :reset_file_processing, if: -> { file.new_record? }
|
||||
|
||||
|
@ -238,7 +238,7 @@ class Asset < ApplicationRecord
|
|||
end
|
||||
end
|
||||
|
||||
to_asset.post_process_file(to_asset.team)
|
||||
to_asset.post_process_file
|
||||
end
|
||||
|
||||
def image?
|
||||
|
@ -273,19 +273,9 @@ class Asset < ApplicationRecord
|
|||
pdf? || (previewable_document?(blob) && Rails.application.config.x.enable_pdf_previews)
|
||||
end
|
||||
|
||||
def post_process_file(team = nil)
|
||||
# Extract asset text if it's of correct type
|
||||
if text?
|
||||
Rails.logger.info "Asset #{id}: Creating extract text job"
|
||||
# The extract_asset_text also includes
|
||||
# estimated size calculation
|
||||
Delayed::Job.enqueue(AssetTextExtractionJob.new(id, in_template))
|
||||
elsif marvinjs?
|
||||
extract_asset_text
|
||||
else
|
||||
# Update asset's estimated size immediately
|
||||
update_estimated_size(team)
|
||||
end
|
||||
def post_process_file
|
||||
# Update asset's estimated size immediately
|
||||
update_estimated_size unless text? || marvinjs?
|
||||
|
||||
if Rails.application.config.x.enable_pdf_previews && previewable_document?(blob)
|
||||
PdfPreviewJob.perform_later(id)
|
||||
|
@ -293,43 +283,10 @@ class Asset < ApplicationRecord
|
|||
end
|
||||
end
|
||||
|
||||
def extract_asset_text(in_template = false)
|
||||
self.in_template = in_template
|
||||
|
||||
if marvinjs?
|
||||
mjs_doc = Nokogiri::XML(file.metadata[:description])
|
||||
mjs_doc.remove_namespaces!
|
||||
text_data = mjs_doc.search("//Field[@name='text']").collect(&:text).join(' ')
|
||||
else
|
||||
blob.open do |tmp_file|
|
||||
text_data = Yomu.new(tmp_file.path).text
|
||||
end
|
||||
end
|
||||
|
||||
if asset_text_datum.present?
|
||||
# Update existing text datum if it exists
|
||||
asset_text_datum.update(data: text_data)
|
||||
else
|
||||
# Create new text datum
|
||||
AssetTextDatum.create(data: text_data, asset: self)
|
||||
end
|
||||
|
||||
Rails.logger.info "Asset #{id}: Asset file successfully extracted"
|
||||
|
||||
# Finally, update asset's estimated size to include
|
||||
# the data vector
|
||||
update_estimated_size(team)
|
||||
rescue StandardError => e
|
||||
Rails.logger.fatal(
|
||||
"Asset #{id}: Error extracting contents from asset "\
|
||||
"file #{file.blob.key}: #{e.message}"
|
||||
)
|
||||
end
|
||||
|
||||
# If team is provided, its space_taken
|
||||
# is updated as well
|
||||
def update_estimated_size(team = nil)
|
||||
return if file_size.blank? || in_template
|
||||
def update_estimated_size
|
||||
return if file_size.blank?
|
||||
|
||||
es = file_size
|
||||
if asset_text_datum.present? && asset_text_datum.persisted?
|
||||
|
|
|
@ -70,7 +70,7 @@ class RepositoryAssetValue < ApplicationRecord
|
|||
asset.last_modified_by = user
|
||||
self.last_modified_by = user
|
||||
asset.save! && save!
|
||||
asset.post_process_file(repository_cell.repository_column.repository.team)
|
||||
asset.post_process_file
|
||||
end
|
||||
|
||||
def snapshot!(cell_snapshot)
|
||||
|
@ -104,7 +104,7 @@ class RepositoryAssetValue < ApplicationRecord
|
|||
value.asset.file.attach(io: StringIO.new(Base64.decode64(payload[:file_data])), filename: payload[:file_name])
|
||||
end
|
||||
|
||||
value.asset.post_process_file(team)
|
||||
value.asset.post_process_file
|
||||
value
|
||||
end
|
||||
|
||||
|
|
|
@ -24,7 +24,7 @@ class MarvinJsService
|
|||
team_id: current_team.id)
|
||||
attach_file(asset.file, file, params)
|
||||
asset.save!
|
||||
asset.post_process_file(current_team)
|
||||
asset.post_process_file
|
||||
connect_asset(asset, params, current_user)
|
||||
end
|
||||
|
||||
|
@ -41,7 +41,7 @@ class MarvinJsService
|
|||
file = generate_image(params)
|
||||
attach_file(attachment, file, params)
|
||||
asset.update(last_modified_by: current_user) if asset.is_a?(Asset)
|
||||
asset.post_process_file(current_team) if asset.class == Asset
|
||||
asset.post_process_file if asset.instance_of?(Asset)
|
||||
asset
|
||||
end
|
||||
|
||||
|
|
|
@ -960,10 +960,9 @@ class TeamImporter
|
|||
asset.last_modified_by_id =
|
||||
user_id || find_user(asset.last_modified_by_id)
|
||||
asset.team = team
|
||||
asset.in_template = true if @is_template
|
||||
asset.save!
|
||||
asset.file.attach(io: file, filename: File.basename(file))
|
||||
asset.post_process_file(team)
|
||||
asset.post_process_file
|
||||
@asset_mappings[orig_asset_id] = asset.id
|
||||
@asset_counter += 1
|
||||
asset
|
||||
|
|
|
@ -148,7 +148,7 @@ class ProtocolsImporter
|
|||
|
||||
# Post process assets
|
||||
asset_ids.each do |asset_id|
|
||||
Asset.find(asset_id).post_process_file(protocol.team)
|
||||
Asset.find(asset_id).post_process_file
|
||||
end
|
||||
end
|
||||
|
||||
|
|
|
@ -95,7 +95,7 @@ class ProtocolsImporterV2
|
|||
|
||||
# Post process assets
|
||||
asset_ids.each do |asset_id|
|
||||
Asset.find(asset_id).post_process_file(protocol.team)
|
||||
Asset.find(asset_id).post_process_file
|
||||
end
|
||||
end
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
require 'active_storage/previewer/libreoffice_previewer'
|
||||
require 'active_storage/analyzer/image_analyzer/custom_image_magick'
|
||||
require 'active_storage/analyzer/text_extraction_analyzer'
|
||||
require 'active_storage/downloader'
|
||||
|
||||
# Enable PDF previews for files
|
||||
|
@ -11,6 +12,7 @@ Rails.application.config.active_storage.previewers = [ActiveStorage::Previewer::
|
|||
ActiveStorage::Previewer::LibreofficePreviewer]
|
||||
|
||||
Rails.application.config.active_storage.analyzers.prepend(ActiveStorage::Analyzer::ImageAnalyzer::CustomImageMagick)
|
||||
Rails.application.config.active_storage.analyzers.append(ActiveStorage::Analyzer::TextExtractionAnalyzer)
|
||||
|
||||
Rails.application.config.active_storage.variable_content_types << 'image/svg+xml'
|
||||
|
||||
|
|
66
lib/active_storage/analyzer/text_extraction_analyzer.rb
Normal file
66
lib/active_storage/analyzer/text_extraction_analyzer.rb
Normal file
|
@ -0,0 +1,66 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
module ActiveStorage
  # Active Storage analyzer that extracts plain text from blobs attached to
  # Asset records and stores it in each asset's text datum.
  #
  # The extracted text is persisted through the Asset association
  # (asset_text_datum); the analyzer's own metadata result only flags that the
  # extraction has run.
  class Analyzer::TextExtractionAnalyzer < Analyzer
    # Accepts a blob only when its content type is listed in
    # Constants::TEXT_EXTRACT_FILE_TYPES and it is attached to at least one Asset.
    def self.accept?(blob)
      blob.content_type.in?(Constants::TEXT_EXTRACT_FILE_TYPES) && blob.attachments.where(record_type: 'Asset').any?
    end

    # NOTE(review): per the Active Storage analyzer API this is expected to make
    # the analysis run in a background job rather than inline - confirm against
    # the Rails version in use.
    def self.analyze_later?
      true
    end

    # Downloads the blob to a tempfile, dispatches to the extraction strategy
    # matching the blob and returns the metadata hash produced by it.
    #
    # PDF takes precedence over the MarvinJS check; everything else goes
    # through the generic extractor.
    def metadata
      download_blob_to_tempfile do |file|
        if blob.content_type == 'application/pdf'
          process_pdf(file)
        elsif blob.metadata[:asset_type] == 'marvinjs'
          process_marvinjs(file)
        else
          process_other(file)
        end
      end
    end

    private

    # Extracts text from a PDF with the external `pdftotext` binary.
    # Falls back to the generic extractor when the binary is not installed
    # (Errno::ENOENT).
    def process_pdf(file)
      # Block form closes the pipe and waits for the child process as soon as
      # the output has been read, instead of leaving that to the GC.
      text_data = IO.popen(['pdftotext', file.path, '-'], 'r', &:read)
      create_or_update_text_data(text_data)
    rescue Errno::ENOENT
      logger.info "pdftotext isn't installed, falling back to default text extraction method"
      process_other(file)
    end

    # Extracts the text fields of a MarvinJS sketch.
    #
    # The sketch source lives in the blob's metadata (:description), not in the
    # downloaded file, so the tempfile argument is intentionally unused. The
    # tempfile object has no `metadata` method, hence the blob is read here.
    def process_marvinjs(_file)
      mjs_doc = Nokogiri::XML(blob.metadata[:description].to_s)
      mjs_doc.remove_namespaces!
      text_data = mjs_doc.search("//Field[@name='text']").collect(&:text).join(' ')
      create_or_update_text_data(text_data)
    end

    # Generic extraction through Yomu for every other accepted file type.
    def process_other(file)
      text_data = Yomu.new(file.path).text
      create_or_update_text_data(text_data)
    end

    # Stores the extracted text on every Asset the blob is attached to and
    # refreshes each asset's estimated size afterwards.
    #
    # Returns the metadata hash reported back to Active Storage.
    # Raises whatever update!/create_asset_text_datum! raise on failure.
    def create_or_update_text_data(text_data)
      blob.attachments.where(record_type: 'Asset').each do |attachment|
        asset = attachment.record
        if asset.asset_text_datum.present?
          # Update existing text datum if it exists
          asset.asset_text_datum.update!(data: text_data)
        else
          # Create new text datum
          asset.create_asset_text_datum!(data: text_data)
        end

        asset.update_estimated_size

        Rails.logger.info "Asset #{asset.id}: file text successfully extracted"
      end

      { text_extracted: true }
    end
  end
end
|
Loading…
Add table
Reference in a new issue