Project

General

Profile

Feature #306 » 0001-implements-fulltext-extraction-for-attachments.patch

[new] adds text extractor using the plaintext gem - Jens Krämer, 2018-10-26 07:35

View differences:

Gemfile
25 25
gem 'tzinfo-data', platforms: [:mingw, :x64_mingw, :mswin]
26 26
gem "rbpdf", "~> 1.19.6"
27 27

  
28
gem 'plaintext'
29

  
28 30
# Optional gem for LDAP authentication
29 31
group :ldap do
30 32
  gem "net-ldap", "~> 0.16.0"
config/configuration.yml.example
209 209
  # allowed values: :memory, :file, :memcache
210 210
  #openid_authentication_store: :memory
211 211

  
212
  # Text extraction helper programs.
213
  #
214
  # Commands should write the resulting plain text to STDOUT. Use __FILE__ as
215
  # placeholder for the file path. The values below are the defaults.
216
  #
217
  # To disable a certain extractor without having to remove it from your
218
  # system, set its command to a non-existent binary, e.g.:
219
  #
220
  #   pdftotext:
221
  #     - /usr/bin/pdftotext_disabled
222
  #
223
  text_extractors:
224
    # apt install poppler-utils
225
    # pdftotext:
226
    #   - /usr/bin/pdftotext
227
    #   - -enc
228
    #   - UTF-8
229
    #   - __FILE__
230
    #   - '-'
231

  
232
    # apt install unrtf
233
    # unrtf:
234
    #   - /usr/bin/unrtf
235
    #   - --text
236
    #   - __FILE__
237

  
238
    # apt install catdoc
239
    # catdoc:
240
    #   - /usr/bin/catdoc
241
    #   - -dutf-8
242
    #   - __FILE__
243
    # xls2csv:
244
    #   - /usr/bin/xls2csv
245
    #   - -dutf-8
246
    #   - __FILE__
247
    # catppt:
248
    #   - /usr/bin/catppt
249
    #   - -dutf-8
250
    #   - __FILE__
251

  
252
    # apt-get install tesseract-ocr
253
    # tesseract:
254
    #   - /usr/bin/tesseract
255
    #   - -dutf-8
256
    #   - __FILE__
257

  
212 258
# specific configuration options for production environment
213 259
# that overrides the default ones
214 260
production:
lib/redmine/configuration.rb
65 65
          end
66 66
        end
67 67

  
68
        if text_extractors = @config['text_extractors']
69
          Plaintext::Configuration.load YAML.dump text_extractors
70
        end
71

  
68 72
        check_regular_expressions
69 73
        @config
70 74
      end
lib/redmine/text_extractor.rb
1
module Redmine
  # Extracts plain text from an attachment by handing its file on disk and
  # its content type to Plaintext::Resolver.
  class TextExtractor

    # attachment - object responding to +diskfile+ and +content_type+.
    def initialize(attachment)
      @attachment = attachment
    end

    # Returns the extracted fulltext or nil if no matching handler was found
    # for the file type.
    #
    # Any StandardError raised during extraction is logged through
    # Rails.logger and turned into a nil result. Exceptions outside the
    # StandardError hierarchy (signals, SystemExit, NoMemoryError, ...) are
    # deliberately not rescued, so they propagate to the caller untouched.
    def text
      Plaintext::Resolver.new(@attachment.diskfile,
                              @attachment.content_type).text
    rescue StandardError => e
      Rails.logger.error "error in fulltext extraction: #{e}"
      nil
    end

  end
end
20

  
test/unit/lib/redmine/text_extractor_test.rb
1
require_relative '../../../test_helper'
2

  
3
# Unit tests for Redmine::TextExtractor: extraction from a plain text upload
# and from a CSV upload.
class Redmine::TextExtractorTest < ActiveSupport::TestCase
  fixtures :projects, :users, :attachments

  setup do
    @project = Project.find_by_identifier 'ecookbook'
    set_fixtures_attachments_directory
    @dlopper = User.find_by_login 'dlopper'
  end

  # Creates and saves an Attachment on @project, authored by @dlopper, whose
  # file comes from uploaded_test_file(filename, content_type). When a
  # content_type is given it is also assigned on the attachment before
  # saving. Returns the saved attachment.
  def attachment_for(filename, content_type = nil)
    attachment = Attachment.new(
      container: @project,
      file: uploaded_test_file(filename, content_type),
      filename: filename,
      author: @dlopper
    )
    attachment.content_type = content_type if content_type
    attachment.save!
    attachment
  end

  test "should extract text from text file" do
    extractor = Redmine::TextExtractor.new(attachment_for("testfile.txt"))
    extracted = extractor.text
    assert extracted
    assert_match(/this is a text file for upload tests with multiple lines/, extracted)
  end

  test "should extract text from csv" do
    extractor = Redmine::TextExtractor.new(attachment_for("import_dates.csv"))
    extracted = extractor.text
    assert extracted
    assert_match(/Invalid start date/, extracted)
  end
end
37

  
(7-7/11)