mirror of
https://github.com/scinote-eln/scinote-web.git
synced 2025-10-29 15:36:36 +08:00
Add text extraction from MarvinJS for search [SCI-3643]
This commit is contained in:
parent
3b7f715e0b
commit
a2fe4bee94
2 changed files with 35 additions and 22 deletions
|
|
@ -119,7 +119,7 @@ class Asset < ApplicationRecord
|
|||
assets_in_steps, assets_in_results, assets_in_inventories)
|
||||
|
||||
new_query = Asset.left_outer_joins(:asset_text_datum)
|
||||
.left_outer_joins(file_attachment: :blob)
|
||||
.joins(file_attachment: :blob)
|
||||
.from(assets, 'assets')
|
||||
|
||||
a_query = s_query = ''
|
||||
|
|
@ -266,6 +266,10 @@ class Asset < ApplicationRecord
|
|||
end
|
||||
end
|
||||
|
||||
def marvinjs?
|
||||
file.metadata[:asset_type] == 'marvinjs'
|
||||
end
|
||||
|
||||
def post_process_file(team = nil)
|
||||
# Update self.empty
|
||||
update(file_present: true)
|
||||
|
|
@ -277,6 +281,8 @@ class Asset < ApplicationRecord
|
|||
# estimated size calculation
|
||||
Asset.delay(queue: :assets, run_at: 20.minutes.from_now)
|
||||
.extract_asset_text_delayed(id, in_template)
|
||||
elsif marvinjs?
|
||||
extract_asset_text
|
||||
else
|
||||
# Update asset's estimated size immediately
|
||||
update_estimated_size(team)
|
||||
|
|
@ -293,31 +299,36 @@ class Asset < ApplicationRecord
|
|||
def extract_asset_text(in_template = false)
|
||||
self.in_template = in_template
|
||||
|
||||
download_blob_to_tempfile do |tmp_file|
|
||||
if marvinjs?
|
||||
mjs_doc = Nokogiri::XML(file.metadata[:description])
|
||||
mjs_doc.remove_namespaces!
|
||||
text_data = mjs_doc.search("//Field[@name='text']").collect(&:text).join(' ')
|
||||
else
|
||||
# Start Tika as a server
|
||||
Yomu.server(:text) if !ENV['NO_TIKA_SERVER'] && Yomu.class_variable_get(:@@server_pid).nil?
|
||||
|
||||
text_data = Yomu.new(tmp_file.path).text
|
||||
|
||||
if asset_text_datum.present?
|
||||
# Update existing text datum if it exists
|
||||
asset_text_datum.update(data: text_data)
|
||||
else
|
||||
# Create new text datum
|
||||
AssetTextDatum.create(data: text_data, asset: self)
|
||||
download_blob_to_tempfile do |tmp_file|
|
||||
text_data = Yomu.new(tmp_file.path).text
|
||||
end
|
||||
|
||||
Rails.logger.info "Asset #{id}: Asset file successfully extracted"
|
||||
|
||||
# Finally, update asset's estimated size to include
|
||||
# the data vector
|
||||
update_estimated_size(team)
|
||||
rescue StandardError => e
|
||||
Rails.logger.fatal(
|
||||
"Asset #{id}: Error extracting contents from asset "\
|
||||
"file #{file.blob.key}: #{e.message}"
|
||||
)
|
||||
end
|
||||
|
||||
if asset_text_datum.present?
|
||||
# Update existing text datum if it exists
|
||||
asset_text_datum.update(data: text_data)
|
||||
else
|
||||
# Create new text datum
|
||||
AssetTextDatum.create(data: text_data, asset: self)
|
||||
end
|
||||
|
||||
Rails.logger.info "Asset #{id}: Asset file successfully extracted"
|
||||
|
||||
# Finally, update asset's estimated size to include
|
||||
# the data vector
|
||||
update_estimated_size(team)
|
||||
rescue StandardError => e
|
||||
Rails.logger.fatal(
|
||||
"Asset #{id}: Error extracting contents from asset "\
|
||||
"file #{file.blob.key}: #{e.message}"
|
||||
)
|
||||
end
|
||||
|
||||
# If team is provided, its space_taken
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ class MarvinJsService
|
|||
team_id: current_team.id)
|
||||
attach_file(asset.file, file, params)
|
||||
asset.save!
|
||||
asset.post_process_file(current_team)
|
||||
connect_asset(asset, params, current_user)
|
||||
end
|
||||
|
||||
|
|
@ -39,6 +40,7 @@ class MarvinJsService
|
|||
|
||||
file = generate_image(params)
|
||||
attach_file(attachment, file, params)
|
||||
asset.post_process_file(current_team)
|
||||
asset
|
||||
end
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue