Add size limits for file text extraction and preview generation, remove unused data column from asset_text_data [SCI-11717] (#8369)

This commit is contained in:
Alex Kriuchykhin 2025-04-07 14:27:55 +02:00 committed by GitHub
parent bd980fcea4
commit 085cd8dea6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 28 additions and 38 deletions

View file

@ -147,11 +147,7 @@ class AssetsController < ApplicationController
end
# Renders @asset as JSON through AssetSerializer, handing current_user to the serializer.
# NOTE(review): this is a flattened diff hunk (-147,11 +147,7) with the +/- markers stripped.
# The if/else guard with the 'Asset not found' response looks like the removed version and the
# single unconditional render before the final `end` like its replacement — confirm against
# the repository before treating both as live code.
def show
if @asset
render json: @asset, serializer: AssetSerializer, user: current_user
else
render json: { error: 'Asset not found' }, status: :not_found
end
render json: @asset, serializer: AssetSerializer, user: current_user
end
def edit

View file

@ -225,8 +225,8 @@ class Asset < ApplicationRecord
def pdf_preview_ready?
return false if pdf_preview_processing
return true if file_pdf_preview.attached?
return false unless previewable_document?(blob)
PdfPreviewJob.perform_later(id)
ActiveRecord::Base.no_touching { update(pdf_preview_processing: true) }
@ -259,7 +259,6 @@ class Asset < ApplicationRecord
es = file_size
if asset_text_datum.present? && asset_text_datum.persisted?
asset_text_datum.reload
es += get_octet_length_record(asset_text_datum, :data)
es += get_octet_length_record(asset_text_datum, :data_vector)
end
es *= Constants::ASSET_ESTIMATED_SIZE_FACTOR

View file

@ -3,18 +3,6 @@
# Text record that belongs to a single Asset (inverse of Asset#asset_text_datum).
# NOTE(review): this is a flattened diff hunk (-3,18 +3,6) with the +/- markers stripped, so
# pre-change and post-change lines appear together. The `data` presence validation, the
# combined presence/uniqueness validation on :asset, the after_save callback and
# update_ts_index look like the removed lines — confirm against the repository.
class AssetTextDatum < ApplicationRecord
include SearchableModel
validates :data, presence: true
validates :asset, presence: true, uniqueness: true
validates :asset, uniqueness: true
belongs_to :asset, inverse_of: :asset_text_datum
after_save :update_ts_index
# Recomputes data_vector from data via to_tsvector, but only when the save changed `data`.
def update_ts_index
if saved_change_to_data?
# Integer(id) raises on a non-numeric id before it is concatenated into the SQL string.
sql = "UPDATE asset_text_data " +
"SET data_vector = to_tsvector(data) " +
"WHERE id = " + Integer(id).to_s
AssetTextDatum.connection.execute(sql)
end
end
end

View file

@ -90,11 +90,11 @@ class AssetSerializer < ActiveModel::Serializer
end
# Builds the PDF payload for the serialized asset: a hash with :url, :size and :worker_url.
# Returns nil when a guard clause bails out.
# NOTE(review): this is a flattened diff hunk (-90,11 +90,11) with the +/- markers stripped.
# The two `return unless` guards and the two `size:` entries are most likely the before/after
# versions of one line each (the later one in each pair being the replacement) — confirm
# against the repository.
def pdf
return unless object.pdf?
return unless object.pdf? || object.file_pdf_preview.attached?
{
# Native PDFs are served from the download path, everything else from the PDF preview path.
url: object.pdf? ? asset_download_path(object) : asset_pdf_preview_path(object),
size: !object.pdf? && object.pdf_preview_ready? ? object.file_pdf_preview&.blob&.byte_size : object.file_size,
# Native PDFs report the asset's file_size, otherwise the preview blob's byte size (may be nil).
size: object.pdf? ? object.file_size : object.file_pdf_preview&.blob&.byte_size,
worker_url: ActionController::Base.helpers.asset_path('pdf_js_worker.js')
}
end

View file

@ -4,6 +4,7 @@ module ActiveStorageFileUtil
# Method expects instance of ActiveStorage::Blob as argument
def previewable_document?(blob)
return false if blob.blank?
return false if blob.byte_size > Constants::PREVIEW_MAX_FILE_SIZE
previewable = Constants::PREVIEWABLE_FILE_TYPES.include?(blob.content_type)

View file

@ -328,11 +328,15 @@ class Constants
'text/plain'
].freeze
# Largest blob size, in bytes, for which text extraction is attempted.
# TEXT_EXTRACT_MAX_FILE_SIZE_MB is expressed in megabytes, so it is converted to bytes here;
# otherwise a setting of "50" would mean 50 bytes instead of matching the 50.megabytes default
# and the blob.byte_size values this limit is compared against.
TEXT_EXTRACT_MAX_FILE_SIZE = ENV.fetch('TEXT_EXTRACT_MAX_FILE_SIZE_MB', 50).to_i.megabytes
PREVIEWABLE_FILE_TYPES = TEXT_EXTRACT_FILE_TYPES
# default preview timeout to 15 minutes
PREVIEW_TIMEOUT_SECONDS = ENV['PREVIEW_TIMEOUT_SECONDS'] ? ENV['PREVIEW_TIMEOUT_SECONDS'].to_i : 900
# Largest blob size, in bytes, for which a document preview is generated.
# PREVIEW_MAX_FILE_SIZE_MB is expressed in megabytes, so it is converted to bytes here;
# otherwise a setting of "50" would mean 50 bytes instead of matching the 50.megabytes default
# and the blob.byte_size values this limit is compared against.
PREVIEW_MAX_FILE_SIZE = ENV.fetch('PREVIEW_MAX_FILE_SIZE_MB', 50).to_i.megabytes
WHITELISTED_IMAGE_TYPES = [
'gif', 'jpeg', 'pjpeg', 'png', 'x-png', 'svg+xml', 'bmp', 'tiff', 'jpg'
].freeze

View file

@ -0,0 +1,7 @@
# frozen_string_literal: true
# Drops the `data` text column from the `asset_text_data` table.
class RemoveDataColumnFromAssetTextData < ActiveRecord::Migration[7.0]
  # The column type is passed along with the name so that a rollback can re-add the column.
  def change
    remove_column(:asset_text_data, :data, :text)
  end
end

View file

@ -90,7 +90,6 @@ ActiveRecord::Schema[7.0].define(version: 2025_03_25_124848) do
end
create_table "asset_text_data", force: :cascade do |t|
t.text "data", null: false
t.bigint "asset_id", null: false
t.datetime "created_at", precision: nil, null: false
t.datetime "updated_at", precision: nil, null: false

View file

@ -5,7 +5,9 @@ module ActiveStorage
DEFAULT_TIKA_PATH = 'tika-app.jar'
# Tells whether the given blob should be handled here: its content type must be listed in
# Constants::TEXT_EXTRACT_FILE_TYPES and it must be attached to at least one Asset record.
# NOTE(review): this is a flattened diff hunk (-5,7 +5,9) with the +/- markers stripped. The
# single-line condition looks like the removed version and the three-line condition, which
# adds the Constants::TEXT_EXTRACT_MAX_FILE_SIZE check on blob.byte_size, like its
# replacement — confirm against the repository.
def self.accept?(blob)
blob.content_type.in?(Constants::TEXT_EXTRACT_FILE_TYPES) && blob.attachments.where(record_type: 'Asset').any?
blob.content_type.in?(Constants::TEXT_EXTRACT_FILE_TYPES) &&
blob.byte_size <= Constants::TEXT_EXTRACT_MAX_FILE_SIZE &&
blob.attachments.where(record_type: 'Asset').any?
end
def self.analyze_later?
@ -50,14 +52,15 @@ module ActiveStorage
def create_or_update_text_data(text_data)
@blob.attachments.where(record_type: 'Asset').each do |attachemnt|
asset = attachemnt.record
if asset.asset_text_datum.present?
# Update existing text datum if it exists
asset.asset_text_datum.update!(data: text_data)
else
# Create new text datum
asset.create_asset_text_datum!(data: text_data)
end
asset.create_asset_text_datum! if asset.asset_text_datum.blank?
sql = ActiveRecord::Base.sanitize_sql_array(
[
'UPDATE "asset_text_data" SET "data_vector" = to_tsvector(:text_data) WHERE "id" = :id',
{ text_data: text_data, id: asset.asset_text_datum.id }
]
)
AssetTextDatum.connection.execute(sql)
asset.update_estimated_size
Rails.logger.info "Asset #{asset.id}: file text successfully extracted"

View file

@ -2,7 +2,6 @@
# Factory for AssetTextDatum records, each built with an associated asset.
# NOTE(review): this is a flattened diff hunk (-2,7 +2,6) with the +/- markers stripped. The
# `data` attribute line looks like the one removed together with the `data` column — confirm
# against the repository.
FactoryBot.define do
factory :asset_text_datum do
data { "Sample name\tSample type\n" + "sample6\tsample\n" + "\n" }
asset
end
end

View file

@ -15,7 +15,6 @@ describe AssetTextDatum, type: :model do
describe 'Database table' do
it { should have_db_column :id }
it { should have_db_column :data }
it { should have_db_column :asset_id }
it { should have_db_column :created_at }
it { should have_db_column :data_vector }
@ -26,12 +25,7 @@ describe AssetTextDatum, type: :model do
end
describe 'Validations' do
describe '#data' do
it { is_expected.to validate_presence_of(:data) }
end
describe '#asset' do
it { is_expected.to validate_presence_of(:asset) }
it { expect(asset_text_datum).to validate_uniqueness_of(:asset) }
end
end