class Regexp::Lexer
A very thin wrapper around the scanner that breaks quantified literal runs into separate tokens, collects the emitted tokens into an array, calculates their nesting depth, normalizes them for the parser, and checks that they are implemented by the given syntax flavor.
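Each emitted token is a Regexp::Token struct. A rough sketch of the fields a lexed token carries (field values shown are illustrative):

  require 'regexp_parser'

  token = Regexp::Lexer.lex(/a/).first
  token.type   # => :literal
  token.token  # => :literal
  token.text   # => "a"
  token.ts     # => 0   (start offset into the pattern)
  token.te     # => 1   (end offset)
  token.level  # => 0   (group nesting depth)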
Constants
- CLOSING_TOKENS
- OPENING_TOKENS
Attributes
conditional_nesting[RW]
nesting[RW]
set_nesting[RW]
shift[RW]
tokens[RW]
Public Class Methods
lex(input, syntax = "ruby/#{RUBY_VERSION}", &block)
  # File lib/regexp_parser/lexer.rb, line 14
  def self.lex(input, syntax = "ruby/#{RUBY_VERSION}", &block)
    new.lex(input, syntax, &block)
  end
Also aliased as: scan
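A minimal usage sketch of the class-level entry point and its scan alias (token names shown are illustrative):

  require 'regexp_parser'

  Regexp::Lexer.lex(/a|b/).map { |t| [t.type, t.token, t.text] }
  # => [[:literal, :literal, "a"], [:meta, :alternation, "|"], [:literal, :literal, "b"]]

  Regexp::Lexer.scan(/a|b/).size  # scan behaves the same as lex
  # => 3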
Public Instance Methods
lex(input, syntax = "ruby/#{RUBY_VERSION}", &block)
  # File lib/regexp_parser/lexer.rb, line 18
  def lex(input, syntax = "ruby/#{RUBY_VERSION}", &block)
    syntax = Regexp::Syntax.new(syntax)

    self.tokens = []
    self.nesting = 0
    self.set_nesting = 0
    self.conditional_nesting = 0
    self.shift = 0

    last = nil
    Regexp::Scanner.scan(input) do |type, token, text, ts, te|
      type, token = *syntax.normalize(type, token)
      syntax.check! type, token

      ascend(type, token)

      if type == :quantifier and last
        break_literal(last) if last.type == :literal
        break_codepoint_list(last) if last.token == :codepoint_list
      end

      current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
                                  nesting, set_nesting, conditional_nesting)

      current = merge_condition(current) if type == :conditional and
        [:condition, :condition_close].include?(token)

      last.next = current if last
      current.previous = last if last

      tokens << current
      last = current

      descend(type, token)
    end

    if block_given?
      tokens.map { |t| block.call(t) }
    else
      tokens
    end
  end
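When a block is given, each token is yielded to it and the mapped results are returned instead of the raw token array; a small sketch (output illustrative):

  lexer = Regexp::Lexer.new
  lexer.lex('(a)') { |token| token.type }
  # => [:group, :literal, :group]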
Private Instance Methods
ascend(type, token)
  # File lib/regexp_parser/lexer.rb, line 69
  def ascend(type, token)
    case type
    when :group, :assertion
      self.nesting = nesting - 1 if CLOSING_TOKENS.include?(token)
    when :set
      self.set_nesting = set_nesting - 1 if token == :close
    when :conditional
      self.conditional_nesting = conditional_nesting - 1 if token == :close
    end
  end
break_codepoint_list(token)
  # File lib/regexp_parser/lexer.rb, line 106
  def break_codepoint_list(token)
    lead, _, tail = token.text.rpartition(' ')

    return if lead.empty?

    tokens.pop

    tokens << Regexp::Token.new(:escape, :codepoint_list, lead + '}',
                                token.ts, (token.te - tail.length),
                                nesting, set_nesting, conditional_nesting)

    tokens << Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
                                (token.ts + lead.length + 1), (token.te + 3),
                                nesting, set_nesting, conditional_nesting)

    self.shift = shift + 3 # one space less, but extra \, u, {, and }
  end
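For instance, a quantified codepoint list such as \u{61 62}+ is split so that the quantifier applies only to the last codepoint (example output is illustrative):

  Regexp::Lexer.lex('\u{61 62}+').map { |t| [t.token, t.text] }
  # => [[:codepoint_list, "\\u{61}"], [:codepoint_list, "\\u{62}"], [:one_or_more, "+"]]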
break_literal(token)
called by scan to break a literal run that is longer than one character into two separate tokens when it is followed by a quantifier
  # File lib/regexp_parser/lexer.rb, line 93
  def break_literal(token)
    lead, last, _ = token.text.partition(/.\z/mu)

    return if lead.empty?

    tokens.pop

    tokens << Regexp::Token.new(:literal, :literal, lead,
                                token.ts, (token.te - last.bytesize),
                                nesting, set_nesting, conditional_nesting)

    tokens << Regexp::Token.new(:literal, :literal, last,
                                (token.ts + lead.bytesize), token.te,
                                nesting, set_nesting, conditional_nesting)
  end
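For instance, in /ab+/ the scanner emits the literal run "ab" as one token; the lexer splits it so the parser can attach the quantifier to "b" alone (example output is illustrative):

  Regexp::Lexer.lex(/ab+/).map { |t| [t.token, t.text] }
  # => [[:literal, "a"], [:literal, "b"], [:one_or_more, "+"]]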
descend(type, token)
  # File lib/regexp_parser/lexer.rb, line 80
  def descend(type, token)
    case type
    when :group, :assertion
      self.nesting = nesting + 1 if OPENING_TOKENS.include?(token)
    when :set
      self.set_nesting = set_nesting + 1 if token == :open
    when :conditional
      self.conditional_nesting = conditional_nesting + 1 if token == :open
    end
  end
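ascend and descend together assign each token its group nesting depth, which ends up in the token's level field; a hedged illustration:

  Regexp::Lexer.lex(/(a(b))/).map { |t| [t.text, t.level] }
  # => [["(", 0], ["a", 1], ["(", 1], ["b", 2], [")", 1], [")", 0]]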
merge_condition(current)
  # File lib/regexp_parser/lexer.rb, line 121
  def merge_condition(current)
    last = tokens.pop
    Regexp::Token.new(:conditional, :condition, last.text + current.text,
                      last.ts, current.te, nesting, set_nesting, conditional_nesting)
  end
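The scanner emits the condition of a conditional group in several pieces; merging them means the parser sees the whole condition as a single :condition token. A sketch (token names and output are illustrative):

  conds = Regexp::Lexer.lex('(?<a>x)(?(<a>)y|z)').select { |t| t.type == :conditional }
  conds.map { |t| [t.token, t.text] }
  # => [[:open, "(?"], [:condition, "(<a>)"], [:separator, "|"], [:close, ")"]]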