mirror of
https://github.com/scinote-eln/scinote-web.git
synced 2025-11-08 07:21:03 +08:00
Add text extraction from MarvinJS for search [SCI-3643]
This commit is contained in:
parent
3b7f715e0b
commit
a2fe4bee94
2 changed files with 35 additions and 22 deletions
|
|
@ -119,7 +119,7 @@ class Asset < ApplicationRecord
|
||||||
assets_in_steps, assets_in_results, assets_in_inventories)
|
assets_in_steps, assets_in_results, assets_in_inventories)
|
||||||
|
|
||||||
new_query = Asset.left_outer_joins(:asset_text_datum)
|
new_query = Asset.left_outer_joins(:asset_text_datum)
|
||||||
.left_outer_joins(file_attachment: :blob)
|
.joins(file_attachment: :blob)
|
||||||
.from(assets, 'assets')
|
.from(assets, 'assets')
|
||||||
|
|
||||||
a_query = s_query = ''
|
a_query = s_query = ''
|
||||||
|
|
@ -266,6 +266,10 @@ class Asset < ApplicationRecord
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def marvinjs?
|
||||||
|
file.metadata[:asset_type] == 'marvinjs'
|
||||||
|
end
|
||||||
|
|
||||||
def post_process_file(team = nil)
|
def post_process_file(team = nil)
|
||||||
# Update self.empty
|
# Update self.empty
|
||||||
update(file_present: true)
|
update(file_present: true)
|
||||||
|
|
@ -277,6 +281,8 @@ class Asset < ApplicationRecord
|
||||||
# estimated size calculation
|
# estimated size calculation
|
||||||
Asset.delay(queue: :assets, run_at: 20.minutes.from_now)
|
Asset.delay(queue: :assets, run_at: 20.minutes.from_now)
|
||||||
.extract_asset_text_delayed(id, in_template)
|
.extract_asset_text_delayed(id, in_template)
|
||||||
|
elsif marvinjs?
|
||||||
|
extract_asset_text
|
||||||
else
|
else
|
||||||
# Update asset's estimated size immediately
|
# Update asset's estimated size immediately
|
||||||
update_estimated_size(team)
|
update_estimated_size(team)
|
||||||
|
|
@ -293,31 +299,36 @@ class Asset < ApplicationRecord
|
||||||
def extract_asset_text(in_template = false)
|
def extract_asset_text(in_template = false)
|
||||||
self.in_template = in_template
|
self.in_template = in_template
|
||||||
|
|
||||||
download_blob_to_tempfile do |tmp_file|
|
if marvinjs?
|
||||||
|
mjs_doc = Nokogiri::XML(file.metadata[:description])
|
||||||
|
mjs_doc.remove_namespaces!
|
||||||
|
text_data = mjs_doc.search("//Field[@name='text']").collect(&:text).join(' ')
|
||||||
|
else
|
||||||
# Start Tika as a server
|
# Start Tika as a server
|
||||||
Yomu.server(:text) if !ENV['NO_TIKA_SERVER'] && Yomu.class_variable_get(:@@server_pid).nil?
|
Yomu.server(:text) if !ENV['NO_TIKA_SERVER'] && Yomu.class_variable_get(:@@server_pid).nil?
|
||||||
|
download_blob_to_tempfile do |tmp_file|
|
||||||
text_data = Yomu.new(tmp_file.path).text
|
text_data = Yomu.new(tmp_file.path).text
|
||||||
|
|
||||||
if asset_text_datum.present?
|
|
||||||
# Update existing text datum if it exists
|
|
||||||
asset_text_datum.update(data: text_data)
|
|
||||||
else
|
|
||||||
# Create new text datum
|
|
||||||
AssetTextDatum.create(data: text_data, asset: self)
|
|
||||||
end
|
end
|
||||||
|
|
||||||
Rails.logger.info "Asset #{id}: Asset file successfully extracted"
|
|
||||||
|
|
||||||
# Finally, update asset's estimated size to include
|
|
||||||
# the data vector
|
|
||||||
update_estimated_size(team)
|
|
||||||
rescue StandardError => e
|
|
||||||
Rails.logger.fatal(
|
|
||||||
"Asset #{id}: Error extracting contents from asset "\
|
|
||||||
"file #{file.blob.key}: #{e.message}"
|
|
||||||
)
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
if asset_text_datum.present?
|
||||||
|
# Update existing text datum if it exists
|
||||||
|
asset_text_datum.update(data: text_data)
|
||||||
|
else
|
||||||
|
# Create new text datum
|
||||||
|
AssetTextDatum.create(data: text_data, asset: self)
|
||||||
|
end
|
||||||
|
|
||||||
|
Rails.logger.info "Asset #{id}: Asset file successfully extracted"
|
||||||
|
|
||||||
|
# Finally, update asset's estimated size to include
|
||||||
|
# the data vector
|
||||||
|
update_estimated_size(team)
|
||||||
|
rescue StandardError => e
|
||||||
|
Rails.logger.fatal(
|
||||||
|
"Asset #{id}: Error extracting contents from asset "\
|
||||||
|
"file #{file.blob.key}: #{e.message}"
|
||||||
|
)
|
||||||
end
|
end
|
||||||
|
|
||||||
# If team is provided, its space_taken
|
# If team is provided, its space_taken
|
||||||
|
|
|
||||||
|
|
@ -24,6 +24,7 @@ class MarvinJsService
|
||||||
team_id: current_team.id)
|
team_id: current_team.id)
|
||||||
attach_file(asset.file, file, params)
|
attach_file(asset.file, file, params)
|
||||||
asset.save!
|
asset.save!
|
||||||
|
asset.post_process_file(current_team)
|
||||||
connect_asset(asset, params, current_user)
|
connect_asset(asset, params, current_user)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
@ -39,6 +40,7 @@ class MarvinJsService
|
||||||
|
|
||||||
file = generate_image(params)
|
file = generate_image(params)
|
||||||
attach_file(attachment, file, params)
|
attach_file(attachment, file, params)
|
||||||
|
asset.post_process_file(current_team)
|
||||||
asset
|
asset
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue