Feature #306 » 0001-implements-fulltext-extraction-for-attachments.patch
| Gemfile | ||
|---|---|---|
| 25 | 25 |
gem 'tzinfo-data', platforms: [:mingw, :x64_mingw, :mswin] |
| 26 | 26 |
gem "rbpdf", "~> 1.19.6" |
| 27 | 27 | |
| 28 |
gem 'plaintext' |
|
| 29 | ||
| 28 | 30 |
# Optional gem for LDAP authentication |
| 29 | 31 |
group :ldap do |
| 30 | 32 |
gem "net-ldap", "~> 0.16.0" |
| config/configuration.yml.example | ||
|---|---|---|
| 209 | 209 |
# allowed values: :memory, :file, :memcache |
| 210 | 210 |
#openid_authentication_store: :memory |
| 211 | 211 | |
| 212 |
# Text extraction helper programs. |
|
| 213 |
# |
|
| 214 |
# Commands should write the resulting plain text to STDOUT. Use __FILE__ as |
|
| 215 |
# placeholder for the file path. The values below are the defaults. |
|
| 216 |
# |
|
| 217 |
# To disable a certain extractor without having to remove it from your |
|
| 218 |
# system, set its command to a nonexistent binary, e.g.: |
|
| 219 |
# |
|
| 220 |
# pdftotext: |
|
| 221 |
# - /usr/bin/pdftotext_disabled |
|
| 222 |
# |
|
| 223 |
text_extractors: |
|
| 224 |
# apt install poppler-utils |
|
| 225 |
# pdftotext: |
|
| 226 |
# - /usr/bin/pdftotext |
|
| 227 |
# - -enc |
|
| 228 |
# - UTF-8 |
|
| 229 |
# - __FILE__ |
|
| 230 |
# - '-' |
|
| 231 | ||
| 232 |
# apt install unrtf |
|
| 233 |
# unrtf: |
|
| 234 |
# - /usr/bin/unrtf |
|
| 235 |
# - --text |
|
| 236 |
# - __FILE__ |
|
| 237 | ||
| 238 |
# apt install catdoc |
|
| 239 |
# catdoc: |
|
| 240 |
# - /usr/bin/catdoc |
|
| 241 |
# - -dutf-8 |
|
| 242 |
# - __FILE__ |
|
| 243 |
# xls2csv: |
|
| 244 |
# - /usr/bin/xls2csv |
|
| 245 |
# - -dutf-8 |
|
| 246 |
# - __FILE__ |
|
| 247 |
# catppt: |
|
| 248 |
# - /usr/bin/catppt |
|
| 249 |
# - -dutf-8 |
|
| 250 |
# - __FILE__ |
|
| 251 | ||
| 252 |
# apt install tesseract-ocr |
|
| 253 |
# tesseract: |
|
| 254 |
# - /usr/bin/tesseract |
|
| 255 |
# - -dutf-8 |
|
| 256 |
# - __FILE__ |
|
| 257 | ||
| 212 | 258 |
# specific configuration options for production environment |
| 213 | 259 |
# that overrides the default ones |
| 214 | 260 |
production: |
| lib/redmine/configuration.rb | ||
|---|---|---|
| 65 | 65 |
end |
| 66 | 66 |
end |
| 67 | 67 | |
| 68 |
if text_extractors = @config['text_extractors'] |
|
| 69 |
Plaintext::Configuration.load YAML.dump text_extractors |
|
| 70 |
end |
|
| 71 | ||
| 68 | 72 |
check_regular_expressions |
| 69 | 73 |
@config |
| 70 | 74 |
end |
| lib/redmine/text_extractor.rb | ||
|---|---|---|
| 1 |
module Redmine |
|
| 2 |
class TextExtractor |
|
| 3 | ||
| 4 |
def initialize(attachment) |
|
| 5 |
@attachment = attachment |
|
| 6 |
end |
|
| 7 | ||
| 8 |
# returns the extracted fulltext or nil if no matching handler was found |
|
| 9 |
# for the file type. |
|
| 10 |
def text |
|
| 11 |
Plaintext::Resolver.new(@attachment.diskfile, |
|
| 12 |
@attachment.content_type).text |
|
| 13 |
rescue Exception => e |
|
| 14 |
Rails.logger.error "error in fulltext extraction: #{e}"
|
|
| 15 |
raise e unless e.is_a? StandardError # re-raise Signals / SyntaxErrors etc |
|
| 16 |
end |
|
| 17 | ||
| 18 |
end |
|
| 19 |
end |
|
| 20 | ||
| test/unit/lib/redmine/text_extractor_test.rb | ||
|---|---|---|
| 1 |
require_relative '../../../test_helper' |
|
| 2 | ||
| 3 |
class Redmine::TextExtractorTest < ActiveSupport::TestCase |
|
| 4 |
fixtures :projects, :users, :attachments |
|
| 5 | ||
| 6 |
setup do |
|
| 7 |
@project = Project.find_by_identifier 'ecookbook' |
|
| 8 |
set_fixtures_attachments_directory |
|
| 9 |
@dlopper = User.find_by_login 'dlopper' |
|
| 10 |
end |
|
| 11 | ||
| 12 |
def attachment_for(filename, content_type = nil) |
|
| 13 |
Attachment.new(container: @project, |
|
| 14 |
file: uploaded_test_file(filename, content_type), |
|
| 15 |
filename: filename, |
|
| 16 |
author: @dlopper).tap do |a| |
|
| 17 |
a.content_type = content_type if content_type |
|
| 18 |
a.save! |
|
| 19 |
end |
|
| 20 |
end |
|
| 21 | ||
| 22 |
test "should extract text from text file" do |
|
| 23 |
a = attachment_for "testfile.txt" |
|
| 24 |
te = Redmine::TextExtractor.new a |
|
| 25 |
assert text = te.text |
|
| 26 |
assert_match /this is a text file for upload tests with multiple lines/, text |
|
| 27 |
end |
|
| 28 | ||
| 29 |
test "should extract text from csv" do |
|
| 30 |
a = attachment_for "import_dates.csv" |
|
| 31 |
te = Redmine::TextExtractor.new a |
|
| 32 |
assert text = te.text |
|
| 33 |
assert_match /Invalid start date/, text |
|
| 34 |
end |
|
| 35 | ||
| 36 |
end |
|
| 37 | ||