summaryrefslogtreecommitdiff
path: root/assignments/4/tokenizer.rb
blob: 0bd2e2d87f16be7b90163878c2e6cb32a593f400 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
#!/usr/bin/env ruby

# One or more blank characters (spaces/tabs); input is first split on these.
WHITESPACE = Regexp.new('[[:blank:]]+')

# Single characters that always form a token of their own, even when they
# are glued to neighbouring text (e.g. the parentheses in "f(x)").
SPLITTABLES = [';', '(', ')'].freeze

# Matches either exactly one splittable character or a maximal run of
# characters that are not splittable. Built once from SPLITTABLES so the two
# definitions cannot drift apart.
TOKEN_PATTERN = Regexp.new(
  "#{Regexp.union(SPLITTABLES).source}|[^#{Regexp.escape(SPLITTABLES.join)}]+"
)

# Splits a piece of source text into tokens.
#
# The text is stripped of surrounding whitespace, split on runs of blanks,
# and every resulting chunk is further divided so that each character listed
# in SPLITTABLES becomes its own token while the text between them is kept
# together.
#
# @param code [String] the text to tokenize
# @return [Array<String>] tokens in input order; empty when +code+ is empty
#   or consists only of whitespace
def tokenize(code)
  # scan (rather than partition) walks the whole chunk, so text that follows
  # a splittable character -- "foo" in "print(foo)" -- stays a single token
  # instead of being broken into individual characters.
  code.strip.split(WHITESPACE).flat_map { |chunk| chunk.scan(TOKEN_PATTERN) }
end


# Command-line driver: tokenize the first argument and print both the raw
# input and the resulting token list.
code = ARGV[0]

# With no argument ARGV[0] is nil and tokenize would fail with a
# NoMethodError; exit with a usage message on stderr instead.
abort "Usage: #{$PROGRAM_NAME} CODE" if code.nil?

tokens = tokenize(code)

puts "Input: #{code.inspect}"
puts "Tokens: #{tokens.inspect}"