summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- CHANGELOG.md 1
-rw-r--r-- Gemfile.lock 11
-rw-r--r-- lib/spandx.rb 1
-rw-r--r-- lib/spandx/content.rb 38
-rw-r--r-- lib/spandx/database.rb 3
-rw-r--r-- lib/spandx/guess.rb 39
-rw-r--r-- lib/spandx/license.rb 6
-rw-r--r-- spandx.gemspec 2
-rw-r--r-- spec/spec_helper.rb 2
-rw-r--r-- spec/unit/guess_spec.rb 25
10 files changed, 105 insertions, 23 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index a14850f..19c4bc4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
### Added
- Parse .NET `sln` files
+- Add ability to choose Levenshtein algorithm
## [0.1.7] - 2020-01-28
### Added
diff --git a/Gemfile.lock b/Gemfile.lock
index 659436a..9eaaf68 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -5,6 +5,7 @@ PATH
bundler (>= 1.16, < 3.0.0)
net-hippie (~> 0.3)
nokogiri (~> 1.10)
+ text (~> 1.3)
thor (~> 0.1)
GEM
@@ -13,6 +14,9 @@ GEM
addressable (2.7.0)
public_suffix (>= 2.0.2, < 5.0)
ast (2.4.0)
+ benchmark-malloc (0.1.0)
+ benchmark-perf (0.5.0)
+ benchmark-trend (0.3.0)
bundler-audit (0.6.1)
bundler (>= 1.2.0, < 3)
thor (~> 0.18)
@@ -35,6 +39,11 @@ GEM
rspec-core (~> 3.9.0)
rspec-expectations (~> 3.9.0)
rspec-mocks (~> 3.9.0)
+ rspec-benchmark (0.5.1)
+ benchmark-malloc (~> 0.1.0)
+ benchmark-perf (~> 0.5.0)
+ benchmark-trend (~> 0.3.0)
+ rspec (>= 3.0.0, < 4.0.0)
rspec-core (3.9.1)
rspec-support (~> 3.9.1)
rspec-expectations (3.9.0)
@@ -55,6 +64,7 @@ GEM
rubocop (>= 0.68.1)
ruby-progressbar (1.10.1)
safe_yaml (1.0.5)
+ text (1.3.1)
thor (0.20.3)
unicode-display_width (1.6.1)
vcr (5.0.0)
@@ -70,6 +80,7 @@ DEPENDENCIES
bundler-audit (~> 0.6)
rake (~> 13.0)
rspec (~> 3.0)
+ rspec-benchmark (~> 0.5)
rubocop (~> 0.52)
rubocop-rspec (~> 1.22)
spandx!
diff --git a/lib/spandx.rb b/lib/spandx.rb
index ca6c9ba..4d264d1 100644
--- a/lib/spandx.rb
+++ b/lib/spandx.rb
@@ -6,6 +6,7 @@ require 'json'
require 'net/hippie'
require 'nokogiri'
require 'pathname'
+require 'text'
require 'spandx/catalogue'
require 'spandx/content'
diff --git a/lib/spandx/content.rb b/lib/spandx/content.rb
index 380058d..085bc2a 100644
--- a/lib/spandx/content.rb
+++ b/lib/spandx/content.rb
@@ -2,22 +2,33 @@
module Spandx
class Content
- attr_reader :tokens, :threshold
+ attr_reader :raw, :threshold
- def initialize(content, threshold: 89.0)
+ def initialize(raw, threshold: 89.0)
@threshold = threshold
- @tokens = tokenize(canonicalize(content)).to_set
+ @raw = raw
end
- def similar?(other)
- similarity_score(other) > threshold
+ def tokens
+ @tokens ||= tokenize(canonicalize(raw)).to_set
end
- # https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Dice%27s_coefficient#Ruby
- def similarity_score(other)
- overlap = (tokens & other.tokens).size
- total = tokens.size + other.tokens.size
- 100.0 * (overlap * 2.0 / total)
+ def similar?(other, algorithm: :dice_coefficient)
+ case algorithm
+ when :dice_coefficient
+ similarity_score(other) > threshold
+ when :levenshtein
+ similarity_score(other) < threshold
+ end
+ end
+
+ def similarity_score(other, algorithm: :dice_coefficient)
+ case algorithm
+ when :dice_coefficient
+ dice_coefficient(other)
+ when :levenshtein
+ Text::Levenshtein.distance(raw, other.raw, 100)
+ end
end
private
@@ -33,5 +44,12 @@ module Spandx
def blank?(content)
content.nil? || content.chomp.strip.empty?
end
+
+ # https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Dice%27s_coefficient#Ruby
+ def dice_coefficient(other)
+ overlap = (tokens & other.tokens).size
+ total = tokens.size + other.tokens.size
+ 100.0 * (overlap * 2.0 / total)
+ end
end
end
diff --git a/lib/spandx/database.rb b/lib/spandx/database.rb
index 716aa17..701b64a 100644
--- a/lib/spandx/database.rb
+++ b/lib/spandx/database.rb
@@ -14,7 +14,8 @@ module Spandx
end
def read(file)
- IO.read(File.join(path, file))
+ full_path = File.join(path, file)
+ IO.read(full_path) if File.exist?(full_path)
end
private
diff --git a/lib/spandx/guess.rb b/lib/spandx/guess.rb
index 36994fc..f459ab2 100644
--- a/lib/spandx/guess.rb
+++ b/lib/spandx/guess.rb
@@ -27,19 +27,42 @@ module Spandx
@catalogue = catalogue
end
- def license_for(raw_content)
+ def license_for(raw_content, algorithm: :dice_coefficient)
content = Content.new(raw_content)
+ score = nil
- max_score = nil
- catalogue.each do |license|
- next if license.deprecated_license_id?
+ if algorithm == :dice_coefficient
+ catalogue.each do |license|
+ next if license.deprecated_license_id?
- percentage = content.similarity_score(license.content)
- if (percentage > 89.0) && (max_score.nil? || percentage > max_score.score)
- max_score = Score.new(percentage, license)
+ score = dice(content, license, score)
end
+ elsif algorithm == :levenshtein
+ catalogue.each do |license|
+ next if license.deprecated_license_id?
+
+ score = levenshtein(content, license, score)
+ end
+ end
+ score ? score.item.id : nil
+ end
+
+ private
+
+ def levenshtein(target, other, score)
+ percentage = target.similarity_score(other.content, algorithm: :levenshtein)
+ if (score.nil? || percentage < score.score)
+ return Score.new(percentage, other)
+ end
+ score
+ end
+
+ def dice(target, other, score)
+ percentage = target.similarity_score(other.content, algorithm: :dice_coefficient)
+ if (percentage > 89.0) && (score.nil? || percentage > score.score)
+ return Score.new(percentage, other)
end
- max_score ? max_score.item.id : nil
+ score
end
end
end
diff --git a/lib/spandx/license.rb b/lib/spandx/license.rb
index fd54e6a..f77b5be 100644
--- a/lib/spandx/license.rb
+++ b/lib/spandx/license.rb
@@ -61,7 +61,11 @@ module Spandx
end
def content
- @content ||= Content.new(Spandx.db.read("text/#{id}.txt"))
+ @content ||= Content.new(raw_content)
+ end
+
+ def raw_content
+ @raw_content ||= (Spandx.db.read("text/#{id}.txt") || '')
end
def <=>(other)
diff --git a/spandx.gemspec b/spandx.gemspec
index 9cb278f..74178b3 100644
--- a/spandx.gemspec
+++ b/spandx.gemspec
@@ -31,12 +31,14 @@ Gem::Specification.new do |spec|
spec.required_ruby_version = '>= 2.5.0'
spec.add_dependency 'bundler', '>= 1.16', '< 3.0.0'
+ spec.add_dependency 'text', '~> 1.3'
spec.add_dependency 'net-hippie', '~> 0.3'
spec.add_dependency 'nokogiri', '~> 1.10'
spec.add_dependency 'thor', '~> 0.1'
spec.add_development_dependency 'bundler-audit', '~> 0.6'
spec.add_development_dependency 'rake', '~> 13.0'
spec.add_development_dependency 'rspec', '~> 3.0'
+ spec.add_development_dependency 'rspec-benchmark', '~> 0.5'
spec.add_development_dependency 'rubocop', '~> 0.52'
spec.add_development_dependency 'rubocop-rspec', '~> 1.22'
spec.add_development_dependency 'vcr', '~> 5.0'
diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb
index 096e7a3..dccb16f 100644
--- a/spec/spec_helper.rb
+++ b/spec/spec_helper.rb
@@ -3,6 +3,7 @@
require 'bundler/setup'
require 'spandx'
require 'spandx/cli'
+require 'rspec-benchmark'
require 'webmock/rspec'
require 'securerandom'
Dir['./spec/support/**/*.rb'].sort.each { |f| require f }
@@ -10,6 +11,7 @@ Dir['./spec/support/**/*.rb'].sort.each { |f| require f }
RSpec.configure do |config|
# Enable flags like --only-failures and --next-failure
config.example_status_persistence_file_path = '.rspec_status'
+ config.include RSpec::Benchmark::Matchers
config.include(Module.new do
def fixture_file(file)
File.join(File.dirname(__FILE__), 'fixtures', file)
diff --git a/spec/unit/guess_spec.rb b/spec/unit/guess_spec.rb
index 2a08d77..0b9b646 100644
--- a/spec/unit/guess_spec.rb
+++ b/spec/unit/guess_spec.rb
@@ -108,9 +108,28 @@ RSpec.describe Spandx::Guess do
pending { expect(subject.license_for(license_file(license))).to eql(license) }
end
end
- end
- it 'guesses the spandx license' do
- expect(subject.license_for(IO.read('LICENSE.txt'))).to eql('MIT')
+ context "when guessing the spandx license" do
+ let(:content) { IO.read('LICENSE.txt') }
+
+ it 'guesses the spandx license using the default algorithm' do
+ expect(subject.license_for(content)).to eql('MIT')
+ end
+
+ [
+ :dice_coefficient,
+ :levenshtein
+ ].each do |algorithm|
+ context algorithm.to_s do
+ specify { expect(subject.license_for(content, algorithm: algorithm)).to eql('MIT') }
+
+ specify do
+ expect do
+ subject.license_for(content, algorithm: algorithm)
+ end.to perform_under(0.05).sample(10)
+ end
+ end
+ end
+ end
end
end