diff options
| -rw-r--r-- | lib/spandx/core/content.rb | 26 | ||||
| -rw-r--r-- | lib/spandx/core/guess.rb | 48 | ||||
| -rw-r--r-- | spandx.gemspec | 2 | ||||
| -rw-r--r-- | spec/unit/core/content_spec.rb | 20 | ||||
| -rw-r--r-- | spec/unit/core/guess_spec.rb | 11 |
5 files changed, 27 insertions, 80 deletions
diff --git a/lib/spandx/core/content.rb b/lib/spandx/core/content.rb index 2132c2a..4cd73c0 100644 --- a/lib/spandx/core/content.rb +++ b/lib/spandx/core/content.rb @@ -13,30 +13,12 @@ module Spandx @tokens ||= tokenize(canonicalize(raw)).to_set end - def similar?(other, algorithm: :dice_coefficient) - case algorithm - when :dice_coefficient - similarity_score(other, algorithm: algorithm) > 89.0 - when :levenshtein - similarity_score(other, algorithm: algorithm) < 3 - when :jaro_winkler - similarity_score(other, algorithm: algorithm) > 89.0 - end + def similar?(other) + similarity_score(other) > 89.0 end - def similarity_score(other, algorithm: :dice_coefficient) - case algorithm - when :dice_coefficient - dice_coefficient(other) - when :levenshtein - require 'text' - - Text::Levenshtein.distance(raw, other.raw, 100) - when :jaro_winkler - require 'jaro_winkler' - - JaroWinkler.distance(raw, other.raw) * 100.0 - end + def similarity_score(other) + dice_coefficient(other) end private diff --git a/lib/spandx/core/guess.rb b/lib/spandx/core/guess.rb index fa25064..f59ab7a 100644 --- a/lib/spandx/core/guess.rb +++ b/lib/spandx/core/guess.rb @@ -10,34 +10,34 @@ module Spandx @name_search = FuzzyMatch.new(catalogue, read: :name) end - def license_for(raw, algorithm: :dice_coefficient) - raw.is_a?(Hash) ? from_hash(raw, algorithm) : from_string(raw, algorithm) + def license_for(raw) + raw.is_a?(Hash) ? from_hash(raw) : from_string(raw) end private - def from_hash(hash, algorithm) - from_string(hash[:name], algorithm) || - from_url(hash[:url], algorithm) || + def from_hash(hash) + from_string(hash[:name]) || + from_url(hash[:url]) || unknown(hash[:name] || hash[:url]) end - def from_string(raw, algorithm) + def from_string(raw) content = Content.new(raw) catalogue[raw] || match_name(content) || - match_body(content, algorithm) || + match_body(content) || unknown(raw) end - def from_url(url, algorithm) + def from_url(url) return if url.nil? || url.empty? response = Spandx.http.get(url) return unless Spandx.http.ok?(response) - license_for(response.body, algorithm: algorithm) + license_for(response.body) end def match_name(content) @@ -46,13 +46,13 @@ module Spandx @name_search.find(content.raw) end - def match_body(content, algorithm) + def match_body(content) score = Score.new(nil, nil) - threshold = threshold_for(algorithm) - direction = algorithm == :levenshtein ? method(:min) : method(:max) - + threshold = 89.0 catalogue.each do |license| - direction.call(content, license, score, threshold, algorithm) unless license.deprecated_license_id? + next if license.deprecated_license_id? + + max(content, license, score, threshold) end score&.item end @@ -61,24 +61,8 @@ module Spandx ::Spandx::Spdx::License.unknown(text) end - def threshold_for(algorithm) - { - dice_coefficient: 89.0, - jaro_winkler: 80.0, - levenshtein: 80.0, - }[algorithm.to_sym] - end - - def min(target, other, score, threshold, algorithm) - percentage = target.similarity_score(other.content, algorithm: algorithm) - return if percentage > threshold - return if score.score > 0.0 && score.score < percentage - - score.update(percentage, other) - end - - def max(target, other, score, threshold, algorithm) - percentage = target.similarity_score(other.content, algorithm: algorithm) + def max(target, other, score, threshold) + percentage = target.similarity_score(other.content) return if percentage < threshold return if score.score >= percentage diff --git a/spandx.gemspec b/spandx.gemspec index 85bc8db..9ed5c50 100644 --- a/spandx.gemspec +++ b/spandx.gemspec @@ -40,7 +40,6 @@ Gem::Specification.new do |spec| spec.add_development_dependency 'bundler-audit', '~> 0.6' spec.add_development_dependency 'byebug', '~> 11.1' - spec.add_development_dependency 'jaro_winkler', '~> 1.5' spec.add_development_dependency 'licensed', '~> 2.8' spec.add_development_dependency 'parallel_tests', '~> 2.32' spec.add_development_dependency 'rake', '~> 13.0' @@ -48,7 +47,6 @@ Gem::Specification.new do |spec| spec.add_development_dependency 'rspec-benchmark', '~> 0.5' spec.add_development_dependency 'rubocop', '~> 0.52' spec.add_development_dependency 'rubocop-rspec', '~> 1.22' - spec.add_development_dependency 'text', '~> 1.3' spec.add_development_dependency 'vcr', '~> 5.0' spec.add_development_dependency 'webmock', '~> 3.7' end diff --git a/spec/unit/core/content_spec.rb b/spec/unit/core/content_spec.rb index 4b01dc4..7a633fc 100644 --- a/spec/unit/core/content_spec.rb +++ b/spec/unit/core/content_spec.rb @@ -17,19 +17,13 @@ RSpec.describe Spandx::Core::Content do let(:mit) { described_class.new(license_file('MIT')) } let(:lgpl) { described_class.new(license_file('LGPL-2.0-only')) } - [ - :dice_coefficient, - :jaro_winkler, - #:levenshtein, - ].each do |algorithm| - specify { expect(subject).to be_similar(mit, algorithm: algorithm) } - specify { expect(subject).not_to be_similar(lgpl, algorithm: algorithm) } - specify { expect(subject).to be_similar(subject, algorithm: algorithm) } - specify { expect(text('hello world')).to be_similar(text('hello world'), algorithm: algorithm) } - specify { expect(text('hello world')).not_to be_similar(text('goodbye world'), algorithm: algorithm) } - specify { expect(text('hello world')).not_to be_similar(text('goodbye universe'), algorithm: algorithm) } - specify { expect(text('a b c')).not_to be_similar(text('b c d'), algorithm: algorithm) } - end + specify { expect(subject).to be_similar(mit) } + specify { expect(subject).not_to be_similar(lgpl) } + specify { expect(subject).to be_similar(subject) } + specify { expect(text('hello world')).to be_similar(text('hello world')) } + specify { expect(text('hello world')).not_to be_similar(text('goodbye world')) } + specify { expect(text('hello world')).not_to be_similar(text('goodbye universe')) } + specify { expect(text('a b c')).not_to be_similar(text('b c d')) } end describe '#similarity_score' do diff --git a/spec/unit/core/guess_spec.rb b/spec/unit/core/guess_spec.rb index 2e267d4..4d5364d 100644 --- a/spec/unit/core/guess_spec.rb +++ b/spec/unit/core/guess_spec.rb @@ -47,17 +47,6 @@ RSpec.describe Spandx::Core::Guess do let!(:content) { IO.read('LICENSE.txt') } specify { expect(subject.license_for(content)&.id).to eql('MIT') } - specify { expect(subject.license_for(content, algorithm: :dice_coefficient)&.id).to eql('MIT') } - specify { expect(subject.license_for(content, algorithm: :levenshtein)&.id).to eql('MIT') } - specify { expect(subject.license_for(content, algorithm: :jaro_winkler)&.id).to eql('MIT') } - - %i[dice_coefficient levenshtein jaro_winkler].each do |algorithm| - pending algorithm do - expect do - subject.license_for(content, algorithm: algorithm) - end.to perform_under(0.01).sample(10) - end - end end end end |
