diff options
| -rw-r--r-- | CHANGELOG.md | 1 | ||||
| -rw-r--r-- | Gemfile.lock | 11 | ||||
| -rw-r--r-- | lib/spandx.rb | 1 | ||||
| -rw-r--r-- | lib/spandx/content.rb | 38 | ||||
| -rw-r--r-- | lib/spandx/database.rb | 3 | ||||
| -rw-r--r-- | lib/spandx/guess.rb | 39 | ||||
| -rw-r--r-- | lib/spandx/license.rb | 6 | ||||
| -rw-r--r-- | spandx.gemspec | 2 | ||||
| -rw-r--r-- | spec/spec_helper.rb | 2 | ||||
| -rw-r--r-- | spec/unit/guess_spec.rb | 25 |
10 files changed, 105 insertions, 23 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index a14850f..19c4bc4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added - Parse .NET `sln` files +- Add ability to choose Levenshtein algorithm ## [0.1.7] - 2020-01-28 ### Added diff --git a/Gemfile.lock b/Gemfile.lock index 659436a..9eaaf68 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -5,6 +5,7 @@ PATH bundler (>= 1.16, < 3.0.0) net-hippie (~> 0.3) nokogiri (~> 1.10) + text (~> 1.3) thor (~> 0.1) GEM @@ -13,6 +14,9 @@ GEM addressable (2.7.0) public_suffix (>= 2.0.2, < 5.0) ast (2.4.0) + benchmark-malloc (0.1.0) + benchmark-perf (0.5.0) + benchmark-trend (0.3.0) bundler-audit (0.6.1) bundler (>= 1.2.0, < 3) thor (~> 0.18) @@ -35,6 +39,11 @@ GEM rspec-core (~> 3.9.0) rspec-expectations (~> 3.9.0) rspec-mocks (~> 3.9.0) + rspec-benchmark (0.5.1) + benchmark-malloc (~> 0.1.0) + benchmark-perf (~> 0.5.0) + benchmark-trend (~> 0.3.0) + rspec (>= 3.0.0, < 4.0.0) rspec-core (3.9.1) rspec-support (~> 3.9.1) rspec-expectations (3.9.0) @@ -55,6 +64,7 @@ GEM rubocop (>= 0.68.1) ruby-progressbar (1.10.1) safe_yaml (1.0.5) + text (1.3.1) thor (0.20.3) unicode-display_width (1.6.1) vcr (5.0.0) @@ -70,6 +80,7 @@ DEPENDENCIES bundler-audit (~> 0.6) rake (~> 13.0) rspec (~> 3.0) + rspec-benchmark (~> 0.5) rubocop (~> 0.52) rubocop-rspec (~> 1.22) spandx! diff --git a/lib/spandx.rb b/lib/spandx.rb index ca6c9ba..4d264d1 100644 --- a/lib/spandx.rb +++ b/lib/spandx.rb @@ -6,6 +6,7 @@ require 'json' require 'net/hippie' require 'nokogiri' require 'pathname' +require 'text' require 'spandx/catalogue' require 'spandx/content' diff --git a/lib/spandx/content.rb b/lib/spandx/content.rb index 380058d..085bc2a 100644 --- a/lib/spandx/content.rb +++ b/lib/spandx/content.rb @@ -2,22 +2,33 @@ module Spandx class Content - attr_reader :tokens, :threshold + attr_reader :raw, :threshold - def initialize(content, threshold: 89.0) + def initialize(raw, threshold: 89.0) @threshold = threshold - @tokens = tokenize(canonicalize(content)).to_set + @raw = raw end - def similar?(other) - similarity_score(other) > threshold + def tokens + @tokens ||= tokenize(canonicalize(raw)).to_set end - # https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Dice%27s_coefficient#Ruby - def similarity_score(other) - overlap = (tokens & other.tokens).size - total = tokens.size + other.tokens.size - 100.0 * (overlap * 2.0 / total) + def similar?(other, algorithm: :dice_coefficient) + case algorithm + when :dice_coefficient + similarity_score(other) > threshold + when :levenshtein + similarity_score(other) < threshold + end + end + + def similarity_score(other, algorithm: :dice_coefficient) + case algorithm + when :dice_coefficient + dice_coefficient(other) + when :levenshtein + Text::Levenshtein.distance(raw, other.raw, 100) + end end private @@ -33,5 +44,12 @@ module Spandx def blank?(content) content.nil? || content.chomp.strip.empty? end + + # https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Dice%27s_coefficient#Ruby + def dice_coefficient(other) + overlap = (tokens & other.tokens).size + total = tokens.size + other.tokens.size + 100.0 * (overlap * 2.0 / total) + end end end diff --git a/lib/spandx/database.rb b/lib/spandx/database.rb index 716aa17..701b64a 100644 --- a/lib/spandx/database.rb +++ b/lib/spandx/database.rb @@ -14,7 +14,8 @@ module Spandx end def read(file) - IO.read(File.join(path, file)) + full_path = File.join(path, file) + IO.read(full_path) if File.exist?(full_path) end private diff --git a/lib/spandx/guess.rb b/lib/spandx/guess.rb index 36994fc..f459ab2 100644 --- a/lib/spandx/guess.rb +++ b/lib/spandx/guess.rb @@ -27,19 +27,42 @@ module Spandx @catalogue = catalogue end - def license_for(raw_content) + def license_for(raw_content, algorithm: :dice_coefficient) content = Content.new(raw_content) + score = nil - max_score = nil - catalogue.each do |license| - next if license.deprecated_license_id? + if algorithm == :dice_coefficient + catalogue.each do |license| + next if license.deprecated_license_id? - percentage = content.similarity_score(license.content) - if (percentage > 89.0) && (max_score.nil? || percentage > max_score.score) - max_score = Score.new(percentage, license) + score = dice(content, license, score) end + elsif algorithm == :levenshtein + catalogue.each do |license| + next if license.deprecated_license_id? + + score = levenshtein(content, license, score) + end + end + score ? score.item.id : nil + end + + private + + def levenshtein(target, other, score) + percentage = target.similarity_score(other.content, algorithm: :levenshtein) + if (score.nil? || percentage < score.score) + return Score.new(percentage, other) + end + score + end + + def dice(target, other, score) + percentage = target.similarity_score(other.content, algorithm: :dice_coefficient) + if (percentage > 89.0) && (score.nil? || percentage > score.score) + return Score.new(percentage, other) end - max_score ? max_score.item.id : nil + score end end end diff --git a/lib/spandx/license.rb b/lib/spandx/license.rb index fd54e6a..f77b5be 100644 --- a/lib/spandx/license.rb +++ b/lib/spandx/license.rb @@ -61,7 +61,11 @@ module Spandx end def content - @content ||= Content.new(Spandx.db.read("text/#{id}.txt")) + @content ||= Content.new(raw_content) + end + + def raw_content + @raw_content ||= (Spandx.db.read("text/#{id}.txt") || '') end def <=>(other) diff --git a/spandx.gemspec b/spandx.gemspec index 9cb278f..74178b3 100644 --- a/spandx.gemspec +++ b/spandx.gemspec @@ -31,12 +31,14 @@ Gem::Specification.new do |spec| spec.required_ruby_version = '>= 2.5.0' spec.add_dependency 'bundler', '>= 1.16', '< 3.0.0' + spec.add_dependency 'text', '~> 1.3' spec.add_dependency 'net-hippie', '~> 0.3' spec.add_dependency 'nokogiri', '~> 1.10' spec.add_dependency 'thor', '~> 0.1' spec.add_development_dependency 'bundler-audit', '~> 0.6' spec.add_development_dependency 'rake', '~> 13.0' spec.add_development_dependency 'rspec', '~> 3.0' + spec.add_development_dependency 'rspec-benchmark', '~> 0.5' spec.add_development_dependency 'rubocop', '~> 0.52' spec.add_development_dependency 'rubocop-rspec', '~> 1.22' spec.add_development_dependency 'vcr', '~> 5.0' diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index 096e7a3..dccb16f 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -3,6 +3,7 @@ require 'bundler/setup' require 'spandx' require 'spandx/cli' +require 'rspec-benchmark' require 'webmock/rspec' require 'securerandom' Dir['./spec/support/**/*.rb'].sort.each { |f| require f } @@ -10,6 +11,7 @@ Dir['./spec/support/**/*.rb'].sort.each { |f| require f } RSpec.configure do |config| # Enable flags like --only-failures and --next-failure config.example_status_persistence_file_path = '.rspec_status' + config.include RSpec::Benchmark::Matchers config.include(Module.new do def fixture_file(file) File.join(File.dirname(__FILE__), 'fixtures', file) diff --git a/spec/unit/guess_spec.rb b/spec/unit/guess_spec.rb index 2a08d77..0b9b646 100644 --- a/spec/unit/guess_spec.rb +++ b/spec/unit/guess_spec.rb @@ -108,9 +108,28 @@ RSpec.describe Spandx::Guess do pending { expect(subject.license_for(license_file(license))).to eql(license) } end end - end - it 'guesses the spandx license' do - expect(subject.license_for(IO.read('LICENSE.txt'))).to eql('MIT') + context "when guessing the spandx license" do + let(:content) { IO.read('LICENSE.txt') } + + it 'guesses the spandx license using the default algorithm' do + expect(subject.license_for(content)).to eql('MIT') + end + + [ + :dice_coefficient, + :levenshtein + ].each do |algorithm| + context algorithm.to_s do + specify { expect(subject.license_for(content, algorithm: algorithm)).to eql('MIT') } + + specify do + expect do + subject.license_for(content, algorithm: algorithm) + end.to perform_under(0.05).sample(10) + end + end + end + end end end |
