summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- lib/spandx/core/content.rb 26
-rw-r--r-- lib/spandx/core/guess.rb 48
-rw-r--r-- spandx.gemspec 2
-rw-r--r-- spec/unit/core/content_spec.rb 20
-rw-r--r-- spec/unit/core/guess_spec.rb 11
5 files changed, 27 insertions, 80 deletions
diff --git a/lib/spandx/core/content.rb b/lib/spandx/core/content.rb
index 2132c2a..4cd73c0 100644
--- a/lib/spandx/core/content.rb
+++ b/lib/spandx/core/content.rb
@@ -13,30 +13,12 @@ module Spandx
@tokens ||= tokenize(canonicalize(raw)).to_set
end
- def similar?(other, algorithm: :dice_coefficient)
- case algorithm
- when :dice_coefficient
- similarity_score(other, algorithm: algorithm) > 89.0
- when :levenshtein
- similarity_score(other, algorithm: algorithm) < 3
- when :jaro_winkler
- similarity_score(other, algorithm: algorithm) > 89.0
- end
+ def similar?(other)
+ similarity_score(other) > 89.0
end
- def similarity_score(other, algorithm: :dice_coefficient)
- case algorithm
- when :dice_coefficient
- dice_coefficient(other)
- when :levenshtein
- require 'text'
-
- Text::Levenshtein.distance(raw, other.raw, 100)
- when :jaro_winkler
- require 'jaro_winkler'
-
- JaroWinkler.distance(raw, other.raw) * 100.0
- end
+ def similarity_score(other)
+ dice_coefficient(other)
end
private
diff --git a/lib/spandx/core/guess.rb b/lib/spandx/core/guess.rb
index fa25064..f59ab7a 100644
--- a/lib/spandx/core/guess.rb
+++ b/lib/spandx/core/guess.rb
@@ -10,34 +10,34 @@ module Spandx
@name_search = FuzzyMatch.new(catalogue, read: :name)
end
- def license_for(raw, algorithm: :dice_coefficient)
- raw.is_a?(Hash) ? from_hash(raw, algorithm) : from_string(raw, algorithm)
+ def license_for(raw)
+ raw.is_a?(Hash) ? from_hash(raw) : from_string(raw)
end
private
- def from_hash(hash, algorithm)
- from_string(hash[:name], algorithm) ||
- from_url(hash[:url], algorithm) ||
+ def from_hash(hash)
+ from_string(hash[:name]) ||
+ from_url(hash[:url]) ||
unknown(hash[:name] || hash[:url])
end
- def from_string(raw, algorithm)
+ def from_string(raw)
content = Content.new(raw)
catalogue[raw] ||
match_name(content) ||
- match_body(content, algorithm) ||
+ match_body(content) ||
unknown(raw)
end
- def from_url(url, algorithm)
+ def from_url(url)
return if url.nil? || url.empty?
response = Spandx.http.get(url)
return unless Spandx.http.ok?(response)
- license_for(response.body, algorithm: algorithm)
+ license_for(response.body)
end
def match_name(content)
@@ -46,13 +46,13 @@ module Spandx
@name_search.find(content.raw)
end
- def match_body(content, algorithm)
+ def match_body(content)
score = Score.new(nil, nil)
- threshold = threshold_for(algorithm)
- direction = algorithm == :levenshtein ? method(:min) : method(:max)
-
+ threshold = 89.0
catalogue.each do |license|
- direction.call(content, license, score, threshold, algorithm) unless license.deprecated_license_id?
+ next if license.deprecated_license_id?
+
+ max(content, license, score, threshold)
end
score&.item
end
@@ -61,24 +61,8 @@ module Spandx
::Spandx::Spdx::License.unknown(text)
end
- def threshold_for(algorithm)
- {
- dice_coefficient: 89.0,
- jaro_winkler: 80.0,
- levenshtein: 80.0,
- }[algorithm.to_sym]
- end
-
- def min(target, other, score, threshold, algorithm)
- percentage = target.similarity_score(other.content, algorithm: algorithm)
- return if percentage > threshold
- return if score.score > 0.0 && score.score < percentage
-
- score.update(percentage, other)
- end
-
- def max(target, other, score, threshold, algorithm)
- percentage = target.similarity_score(other.content, algorithm: algorithm)
+ def max(target, other, score, threshold)
+ percentage = target.similarity_score(other.content)
return if percentage < threshold
return if score.score >= percentage
diff --git a/spandx.gemspec b/spandx.gemspec
index 85bc8db..9ed5c50 100644
--- a/spandx.gemspec
+++ b/spandx.gemspec
@@ -40,7 +40,6 @@ Gem::Specification.new do |spec|
spec.add_development_dependency 'bundler-audit', '~> 0.6'
spec.add_development_dependency 'byebug', '~> 11.1'
- spec.add_development_dependency 'jaro_winkler', '~> 1.5'
spec.add_development_dependency 'licensed', '~> 2.8'
spec.add_development_dependency 'parallel_tests', '~> 2.32'
spec.add_development_dependency 'rake', '~> 13.0'
@@ -48,7 +47,6 @@ Gem::Specification.new do |spec|
spec.add_development_dependency 'rspec-benchmark', '~> 0.5'
spec.add_development_dependency 'rubocop', '~> 0.52'
spec.add_development_dependency 'rubocop-rspec', '~> 1.22'
- spec.add_development_dependency 'text', '~> 1.3'
spec.add_development_dependency 'vcr', '~> 5.0'
spec.add_development_dependency 'webmock', '~> 3.7'
end
diff --git a/spec/unit/core/content_spec.rb b/spec/unit/core/content_spec.rb
index 4b01dc4..7a633fc 100644
--- a/spec/unit/core/content_spec.rb
+++ b/spec/unit/core/content_spec.rb
@@ -17,19 +17,13 @@ RSpec.describe Spandx::Core::Content do
let(:mit) { described_class.new(license_file('MIT')) }
let(:lgpl) { described_class.new(license_file('LGPL-2.0-only')) }
- [
- :dice_coefficient,
- :jaro_winkler,
- #:levenshtein,
- ].each do |algorithm|
- specify { expect(subject).to be_similar(mit, algorithm: algorithm) }
- specify { expect(subject).not_to be_similar(lgpl, algorithm: algorithm) }
- specify { expect(subject).to be_similar(subject, algorithm: algorithm) }
- specify { expect(text('hello world')).to be_similar(text('hello world'), algorithm: algorithm) }
- specify { expect(text('hello world')).not_to be_similar(text('goodbye world'), algorithm: algorithm) }
- specify { expect(text('hello world')).not_to be_similar(text('goodbye universe'), algorithm: algorithm) }
- specify { expect(text('a b c')).not_to be_similar(text('b c d'), algorithm: algorithm) }
- end
+ specify { expect(subject).to be_similar(mit) }
+ specify { expect(subject).not_to be_similar(lgpl) }
+ specify { expect(subject).to be_similar(subject) }
+ specify { expect(text('hello world')).to be_similar(text('hello world')) }
+ specify { expect(text('hello world')).not_to be_similar(text('goodbye world')) }
+ specify { expect(text('hello world')).not_to be_similar(text('goodbye universe')) }
+ specify { expect(text('a b c')).not_to be_similar(text('b c d')) }
end
describe '#similarity_score' do
diff --git a/spec/unit/core/guess_spec.rb b/spec/unit/core/guess_spec.rb
index 2e267d4..4d5364d 100644
--- a/spec/unit/core/guess_spec.rb
+++ b/spec/unit/core/guess_spec.rb
@@ -47,17 +47,6 @@ RSpec.describe Spandx::Core::Guess do
let!(:content) { IO.read('LICENSE.txt') }
specify { expect(subject.license_for(content)&.id).to eql('MIT') }
- specify { expect(subject.license_for(content, algorithm: :dice_coefficient)&.id).to eql('MIT') }
- specify { expect(subject.license_for(content, algorithm: :levenshtein)&.id).to eql('MIT') }
- specify { expect(subject.license_for(content, algorithm: :jaro_winkler)&.id).to eql('MIT') }
-
- %i[dice_coefficient levenshtein jaro_winkler].each do |algorithm|
- pending algorithm do
- expect do
- subject.license_for(content, algorithm: algorithm)
- end.to perform_under(0.01).sample(10)
- end
- end
end
end
end