mirror of
https://github.com/scinote-eln/scinote-web.git
synced 2025-09-14 17:14:54 +08:00
Add size limits for file text extraction and preview generation, remove unused data column from asset_text_data [SCI-11717] (#8369)
This commit is contained in:
parent
bd980fcea4
commit
085cd8dea6
11 changed files with 28 additions and 38 deletions
|
@ -147,11 +147,7 @@ class AssetsController < ApplicationController
|
|||
end
|
||||
|
||||
def show
|
||||
if @asset
|
||||
render json: @asset, serializer: AssetSerializer, user: current_user
|
||||
else
|
||||
render json: { error: 'Asset not found' }, status: :not_found
|
||||
end
|
||||
render json: @asset, serializer: AssetSerializer, user: current_user
|
||||
end
|
||||
|
||||
def edit
|
||||
|
|
|
@ -225,8 +225,8 @@ class Asset < ApplicationRecord
|
|||
|
||||
def pdf_preview_ready?
|
||||
return false if pdf_preview_processing
|
||||
|
||||
return true if file_pdf_preview.attached?
|
||||
return false unless previewable_document?(blob)
|
||||
|
||||
PdfPreviewJob.perform_later(id)
|
||||
ActiveRecord::Base.no_touching { update(pdf_preview_processing: true) }
|
||||
|
@ -259,7 +259,6 @@ class Asset < ApplicationRecord
|
|||
es = file_size
|
||||
if asset_text_datum.present? && asset_text_datum.persisted?
|
||||
asset_text_datum.reload
|
||||
es += get_octet_length_record(asset_text_datum, :data)
|
||||
es += get_octet_length_record(asset_text_datum, :data_vector)
|
||||
end
|
||||
es *= Constants::ASSET_ESTIMATED_SIZE_FACTOR
|
||||
|
|
|
@ -3,18 +3,6 @@
|
|||
class AssetTextDatum < ApplicationRecord
|
||||
include SearchableModel
|
||||
|
||||
validates :data, presence: true
|
||||
validates :asset, presence: true, uniqueness: true
|
||||
validates :asset, uniqueness: true
|
||||
belongs_to :asset, inverse_of: :asset_text_datum
|
||||
|
||||
after_save :update_ts_index
|
||||
|
||||
def update_ts_index
|
||||
if saved_change_to_data?
|
||||
sql = "UPDATE asset_text_data " +
|
||||
"SET data_vector = to_tsvector(data) " +
|
||||
"WHERE id = " + Integer(id).to_s
|
||||
AssetTextDatum.connection.execute(sql)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -90,11 +90,11 @@ class AssetSerializer < ActiveModel::Serializer
|
|||
end
|
||||
|
||||
def pdf
|
||||
return unless object.pdf?
|
||||
return unless object.pdf? || object.file_pdf_preview.attached?
|
||||
|
||||
{
|
||||
url: object.pdf? ? asset_download_path(object) : asset_pdf_preview_path(object),
|
||||
size: !object.pdf? && object.pdf_preview_ready? ? object.file_pdf_preview&.blob&.byte_size : object.file_size,
|
||||
size: object.pdf? ? object.file_size : object.file_pdf_preview&.blob&.byte_size,
|
||||
worker_url: ActionController::Base.helpers.asset_path('pdf_js_worker.js')
|
||||
}
|
||||
end
|
||||
|
|
|
@ -4,6 +4,7 @@ module ActiveStorageFileUtil
|
|||
# Method expects instance of ActiveStorage::Blob as argument
|
||||
def previewable_document?(blob)
|
||||
return false if blob.blank?
|
||||
return false if blob.byte_size > Constants::PREVIEW_MAX_FILE_SIZE
|
||||
|
||||
previewable = Constants::PREVIEWABLE_FILE_TYPES.include?(blob.content_type)
|
||||
|
||||
|
|
|
@ -328,11 +328,15 @@ class Constants
|
|||
'text/plain'
|
||||
].freeze
|
||||
|
||||
TEXT_EXTRACT_MAX_FILE_SIZE = ENV['TEXT_EXTRACT_MAX_FILE_SIZE_MB'] ? ENV['TEXT_EXTRACT_MAX_FILE_SIZE_MB'].to_i : 50.megabytes
|
||||
|
||||
PREVIEWABLE_FILE_TYPES = TEXT_EXTRACT_FILE_TYPES
|
||||
|
||||
# default preview timeout to 15 minutes
|
||||
PREVIEW_TIMEOUT_SECONDS = ENV['PREVIEW_TIMEOUT_SECONDS'] ? ENV['PREVIEW_TIMEOUT_SECONDS'].to_i : 900
|
||||
|
||||
PREVIEW_MAX_FILE_SIZE = ENV['PREVIEW_MAX_FILE_SIZE_MB'] ? ENV['PREVIEW_MAX_FILE_SIZE_MB'].to_i : 50.megabytes
|
||||
|
||||
WHITELISTED_IMAGE_TYPES = [
|
||||
'gif', 'jpeg', 'pjpeg', 'png', 'x-png', 'svg+xml', 'bmp', 'tiff', 'jpg'
|
||||
].freeze
|
||||
|
|
|
@ -0,0 +1,7 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
class RemoveDataColumnFromAssetTextData < ActiveRecord::Migration[7.0]
|
||||
def change
|
||||
remove_column :asset_text_data, :data, :text
|
||||
end
|
||||
end
|
|
@ -90,7 +90,6 @@ ActiveRecord::Schema[7.0].define(version: 2025_03_25_124848) do
|
|||
end
|
||||
|
||||
create_table "asset_text_data", force: :cascade do |t|
|
||||
t.text "data", null: false
|
||||
t.bigint "asset_id", null: false
|
||||
t.datetime "created_at", precision: nil, null: false
|
||||
t.datetime "updated_at", precision: nil, null: false
|
||||
|
|
|
@ -5,7 +5,9 @@ module ActiveStorage
|
|||
DEFAULT_TIKA_PATH = 'tika-app.jar'
|
||||
|
||||
def self.accept?(blob)
|
||||
blob.content_type.in?(Constants::TEXT_EXTRACT_FILE_TYPES) && blob.attachments.where(record_type: 'Asset').any?
|
||||
blob.content_type.in?(Constants::TEXT_EXTRACT_FILE_TYPES) &&
|
||||
blob.byte_size <= Constants::TEXT_EXTRACT_MAX_FILE_SIZE &&
|
||||
blob.attachments.where(record_type: 'Asset').any?
|
||||
end
|
||||
|
||||
def self.analyze_later?
|
||||
|
@ -50,14 +52,15 @@ module ActiveStorage
|
|||
def create_or_update_text_data(text_data)
|
||||
@blob.attachments.where(record_type: 'Asset').each do |attachemnt|
|
||||
asset = attachemnt.record
|
||||
if asset.asset_text_datum.present?
|
||||
# Update existing text datum if it exists
|
||||
asset.asset_text_datum.update!(data: text_data)
|
||||
else
|
||||
# Create new text datum
|
||||
asset.create_asset_text_datum!(data: text_data)
|
||||
end
|
||||
asset.create_asset_text_datum! if asset.asset_text_datum.blank?
|
||||
sql = ActiveRecord::Base.sanitize_sql_array(
|
||||
[
|
||||
'UPDATE "asset_text_data" SET "data_vector" = to_tsvector(:text_data) WHERE "id" = :id',
|
||||
{ text_data: text_data, id: asset.asset_text_datum.id }
|
||||
]
|
||||
)
|
||||
|
||||
AssetTextDatum.connection.execute(sql)
|
||||
asset.update_estimated_size
|
||||
|
||||
Rails.logger.info "Asset #{asset.id}: file text successfully extracted"
|
||||
|
|
|
@ -2,7 +2,6 @@
|
|||
|
||||
FactoryBot.define do
|
||||
factory :asset_text_datum do
|
||||
data { "Sample name\tSample type\n" + "sample6\tsample\n" + "\n" }
|
||||
asset
|
||||
end
|
||||
end
|
||||
|
|
|
@ -15,7 +15,6 @@ describe AssetTextDatum, type: :model do
|
|||
|
||||
describe 'Database table' do
|
||||
it { should have_db_column :id }
|
||||
it { should have_db_column :data }
|
||||
it { should have_db_column :asset_id }
|
||||
it { should have_db_column :created_at }
|
||||
it { should have_db_column :data_vector }
|
||||
|
@ -26,12 +25,7 @@ describe AssetTextDatum, type: :model do
|
|||
end
|
||||
|
||||
describe 'Validations' do
|
||||
describe '#data' do
|
||||
it { is_expected.to validate_presence_of(:data) }
|
||||
end
|
||||
|
||||
describe '#asset' do
|
||||
it { is_expected.to validate_presence_of(:asset) }
|
||||
it { expect(asset_text_datum).to validate_uniqueness_of(:asset) }
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Add table
Reference in a new issue