mirror of
				https://github.com/scinote-eln/scinote-web.git
				synced 2025-10-31 00:19:20 +08:00 
			
		
		
		
	
		
			
				
	
	
		
			74 lines
		
	
	
	
		
			2.3 KiB
		
	
	
	
		
			Ruby
		
	
	
	
	
	
			
		
		
	
	
			74 lines
		
	
	
	
		
			2.3 KiB
		
	
	
	
		
			Ruby
		
	
	
	
	
	
# frozen_string_literal: true

module ActiveStorage
  # Analyzer that extracts the text content of a blob and stores it as a
  # search vector ("asset_text_data"."data_vector") for every Asset record
  # the blob is attached to.
  #
  # Extraction strategy, chosen in #metadata:
  # * PDF blobs         -> `pdftotext`, falling back to Tika if it is missing
  # * MarvinJS sketches -> text fields parsed out of the XML description
  # * everything else   -> Apache Tika (`java -jar <tika> -t <file>`)
  class Analyzer::TextExtractionAnalyzer < Analyzer
    # Tika jar used when the TIKA_PATH environment variable is not set.
    DEFAULT_TIKA_PATH = 'tika-app.jar'

    # Accepts a blob only when all of the following hold: its content type is
    # listed in Constants::TEXT_EXTRACT_FILE_TYPES, its size does not exceed
    # Constants::TEXT_EXTRACT_MAX_FILE_SIZE, and it has at least one
    # attachment whose record_type is 'Asset'.
    #
    # @param blob [ActiveStorage::Blob]
    # @return [Boolean]
    def self.accept?(blob)
      blob.content_type.in?(Constants::TEXT_EXTRACT_FILE_TYPES) &&
        blob.byte_size <= Constants::TEXT_EXTRACT_MAX_FILE_SIZE &&
        blob.attachments.where(record_type: 'Asset').any?
    end

    # @return [true] always; presumably tells ActiveStorage to run this
    #   analyzer asynchronously rather than inline (confirm against the
    #   ActiveStorage::Analyzer API of the Rails version in use).
    def self.analyze_later?
      true
    end

    # Downloads the blob to a temporary file and dispatches to the extraction
    # method matching the blob.
    #
    # @return [Hash] `{ text_extracted: true }` as returned by
    #   #create_or_update_text_data
    def metadata
      download_blob_to_tempfile do |file|
        if blob.content_type == 'application/pdf'
          process_pdf(file)
        elsif blob.metadata[:asset_type] == 'marvinjs'
          process_marvinjs(file)
        else
          process_other(file)
        end
      end
    end

    private

    # Extracts text from a PDF with the `pdftotext` command line tool.
    # Falls back to #process_other when the executable cannot be found.
    #
    # @param file [#path] temporary file holding the blob content
    # @return [Hash] see #create_or_update_text_data
    def process_pdf(file)
      # Block form closes the pipe (and waits for the child process) as soon
      # as the output has been read, instead of leaving it to the GC.
      text_data = IO.popen(['pdftotext', file.path, '-'], 'r', &:read)
      create_or_update_text_data(text_data)
    rescue Errno::ENOENT
      logger.info "pdftotext isn't installed, falling back to default text extraction method"
      process_other(file)
    end

    # Extracts the contents of all `Field` elements named "text" from the
    # MarvinJS XML document (namespaces stripped) and joins them with spaces.
    #
    # The XML is read from the blob's :description metadata, the same
    # metadata hash #metadata consults for :asset_type; the downloaded file
    # itself is not needed here.
    # NOTE(review): the previous code called `file.metadata`, which the
    # yielded temp file is not expected to respond to - confirm that
    # blob.metadata[:description] is where the sketch XML is stored.
    #
    # @param _file [#path] unused; kept so all process_* methods share a
    #   signature
    # @return [Hash] see #create_or_update_text_data
    def process_marvinjs(_file)
      mjs_doc = Nokogiri::XML(blob.metadata[:description])
      mjs_doc.remove_namespaces!
      text_data = mjs_doc.search("//Field[@name='text']").map(&:text).join(' ')
      create_or_update_text_data(text_data)
    end

    # Extracts text by running the Tika jar in headless, text-output mode.
    # The jar location comes from ENV['TIKA_PATH'], defaulting to
    # DEFAULT_TIKA_PATH.
    #
    # @param file [#path] temporary file holding the blob content
    # @return [Hash] see #create_or_update_text_data
    def process_other(file)
      tika_path = ENV.fetch('TIKA_PATH', DEFAULT_TIKA_PATH)
      command = ['java', '-Djava.awt.headless=true', '-jar', tika_path, '-t', file.path]
      # Block form closes the pipe once the output has been read.
      text_data = IO.popen(command, 'r', &:read)
      create_or_update_text_data(text_data)
    end

    # Stores the extracted text for every Asset the blob is attached to.
    #
    # For each such asset an asset_text_datum row is created when missing,
    # then its "data_vector" column is set to to_tsvector(text_data) with a
    # sanitized raw UPDATE, and asset.update_estimated_size is called.
    # ActiveRecord::RecordInvalid raised for one asset is logged and does not
    # stop processing of the remaining assets.
    #
    # @param text_data [String] extracted text
    # @return [Hash] always `{ text_extracted: true }`, even if some assets
    #   failed and were only logged
    def create_or_update_text_data(text_data)
      blob.attachments.where(record_type: 'Asset').each do |attachment|
        asset = attachment.record
        asset.create_asset_text_datum! if asset.asset_text_datum.blank?

        # Values are bound through sanitize_sql_array, never interpolated.
        sql = ActiveRecord::Base.sanitize_sql_array(
          [
            'UPDATE "asset_text_data" SET "data_vector" = to_tsvector(:text_data) WHERE "id" = :id',
            { text_data: text_data, id: asset.asset_text_datum.id }
          ]
        )

        AssetTextDatum.connection.execute(sql)
        asset.update_estimated_size

        Rails.logger.info "Asset #{asset.id}: file text successfully extracted"
      rescue ActiveRecord::RecordInvalid => e
        Rails.logger.error "Asset #{asset.id}: file text unsuccessfully extracted with error #{e.message}"
      end

      { text_extracted: true }
    end
  end
end