Class | FeedParser::SGMLParser |
In: |
lib/feedparser/sgml-parser.rb
|
Parent: | Object |
Interesting | = | /[&<]/ | Regular expressions used for parsing: | |
Incomplete | = | Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' + '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' + '![^<>]*)?') | ||
Entityref | = | /&([a-zA-Z][-.a-zA-Z0-9]*);/ | ||
Charref | = | /&#([0-9]+);/ | ||
Starttagopen | = | /<[>a-zA-Z]/ | ||
Endtagopen | = | /<\/[<>a-zA-Z]/ | ||
Endbracket | = | /[<>]/ | ||
Special | = | /<![^<>]*>/ | ||
Commentopen | = | /<!--/ | ||
Commentclose | = | /--[ \t\n]*>/ | ||
Tagfind | = | /[a-zA-Z][a-zA-Z0-9.-]*/ | ||
Attrfind | = | Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' + '(\s*=\s*' + "('[^']*'" + '|"[^"]*"' + '|[-~a-zA-Z0-9,./:+*%?!()_#=]*))?') | ||
Entitydefs | = | {'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"', 'apos'=>'\''} |
# File lib/feedparser/sgml-parser.rb, line 30
# Builds a parser. +verbose+ is stored in @verbose (read by
# report_unbalanced); all parsing state is then initialised through reset.
def initialize(verbose=false)
  @verbose = verbose
  reset
end
# File lib/feedparser/sgml-parser.rb, line 56
# Appends +data+ to the pending input buffer and parses as much of the
# buffer as possible. The parse is run in non-final mode (goahead(false)),
# so an incomplete trailing construct stays buffered for the next call.
def feed(data)
  @rawdata << data
  goahead(false)
end
# File lib/feedparser/sgml-parser.rb, line 242
# Closes open elements in response to an end tag.
#
# tag:: lower-cased element name, or '' for the SGML shorthand "</>",
#       which closes only the most recently opened element.
#
# If +tag+ is not on the open-element stack nothing is popped;
# unknown_endtag is called unless an 'end_<tag>' handler exists.
# Otherwise every element from the top of the stack down to and including
# the innermost open +tag+ is closed (end_<name> via handle_endtag when the
# handler exists, unknown_endtag otherwise) and popped.
def finish_endtag(tag)
  if tag == ''
    found = @stack.length - 1
    if found < 0
      # "</>" with nothing open.
      unknown_endtag(tag)
      return
    end
  else
    unless @stack.include?(tag)
      unknown_endtag(tag) unless respond_to?('end_' + tag)
      return
    end
    # Innermost (last) occurrence: with nested elements of the same name an
    # end tag must close only the most recently opened one, not unwind the
    # stack all the way to the outermost.
    found = @stack.rindex(tag)
  end
  while @stack.length > found
    open_tag = @stack[-1]
    method = 'end_' + open_tag
    if respond_to?(method)
      handle_endtag(open_tag, method)
    else
      unknown_endtag(open_tag)
    end
    @stack.pop
  end
end
# File lib/feedparser/sgml-parser.rb, line 224
# Dispatches a parsed start tag to the matching handler.
#
# Returns  1 when a 'start_<tag>' handler exists (the tag is pushed on the
#            open-element stack before the handler runs),
#          0 when only a 'do_<tag>' handler exists (nothing is pushed),
#         -1 when neither exists and unknown_starttag was called.
def finish_starttag(tag, attrs)
  start_handler = 'start_' + tag
  if respond_to?(start_handler)
    @stack << tag
    handle_starttag(tag, start_handler, attrs)
    return 1
  end

  do_handler = 'do_' + tag
  if respond_to?(do_handler)
    handle_starttag(tag, do_handler, attrs)
    return 0
  end

  unknown_starttag(tag, attrs)
  -1
end
# File lib/feedparser/sgml-parser.rb, line 65
# Main scanning loop: consumes as much of @rawdata as can be parsed and
# leaves the unconsumed tail in @rawdata.
#
# _end:: when true, whatever could not be parsed is flushed through
#        handle_data instead of being kept for a later call.
#
# Text between markup goes to handle_data. At each '<' or '&' the start-tag,
# end-tag, comment, special, character-reference and entity-reference
# patterns are tried in that order. While @literal is set, start tags,
# comments and specials are not parsed; their '<' is emitted as data.
def goahead(_end)
  rawdata = @rawdata
  pos = 0
  total = rawdata.length
  while pos < total
    if @nomoretags
      # Markup recognition is switched off: the rest is plain data.
      handle_data(rawdata[pos..(total-1)])
      pos = total
      break
    end

    markup_at = rawdata.index(Interesting, pos) || total
    handle_data(rawdata[pos..(markup_at-1)]) if pos < markup_at
    pos = markup_at
    break if pos == total

    case rawdata[pos]
    when ?<
      if rawdata.index(Starttagopen, pos) == pos
        if @literal
          handle_data(rawdata[pos, 1])
          pos += 1
          next
        end
        consumed = parse_starttag(pos)
        break unless consumed
        pos = consumed # parse_starttag returns an absolute offset
        next
      elsif rawdata.index(Endtagopen, pos) == pos
        consumed = parse_endtag(pos)
        break unless consumed
        pos = consumed # parse_endtag returns an absolute offset
        @literal = false
        next
      elsif rawdata.index(Commentopen, pos) == pos
        if @literal
          handle_data(rawdata[pos, 1])
          pos += 1
          next
        end
        consumed = parse_comment(pos)
        break unless consumed
        pos += consumed # parse_comment returns a length
        next
      elsif rawdata.index(Special, pos) == pos
        if @literal
          handle_data(rawdata[pos, 1])
          pos += 1
          next
        end
        consumed = parse_special(pos)
        break unless consumed
        pos += consumed # parse_special returns a length
        next
      end
    when ?&
      if rawdata.index(Charref, pos) == pos
        ref = Regexp.last_match
        pos += ref[0].length
        handle_charref(ref[1])
        pos -= 1 unless rawdata[pos-1] == ?;
        next
      elsif rawdata.index(Entityref, pos) == pos
        ref = Regexp.last_match
        pos += ref[0].length
        handle_entityref(ref[1])
        pos -= 1 unless rawdata[pos-1] == ?;
        next
      end
    else
      raise RuntimeError, 'neither < nor & ??'
    end

    # Reached only when none of the complete patterns matched at pos.
    partial_at = rawdata.index(Incomplete, pos)
    unless partial_at == pos
      handle_data(rawdata[pos, 1])
      pos += 1
      next
    end
    partial_end = partial_at + Regexp.last_match[0].length
    break if partial_end == total # really incomplete: wait for more input
    handle_data(rawdata[pos..(partial_end-1)])
    pos = partial_end
  end

  if _end && pos < total
    handle_data(@rawdata[pos..(total-1)])
    pos = total
  end
  @rawdata = rawdata[pos..-1]
end
# File lib/feedparser/sgml-parser.rb, line 295
# Handles a numeric character reference such as "&#233;".
#
# name:: the decimal digits between "&#" and ";" as a String.
#
# References in 0..255 are converted to the corresponding character and
# passed to handle_data; anything else goes to unknown_charref.
def handle_charref(name)
  n = name.to_i
  unless n.between?(0, 255)
    unknown_charref(name)
    return
  end
  # Integer#chr yields a lone ASCII-8BIT byte for 128..255 on Ruby 1.9+,
  # which cannot be concatenated with non-ASCII UTF-8 text. Emitting the
  # code point as UTF-8 keeps 0..127 unchanged and makes 128..255 usable.
  handle_data([n].pack('U'))
end
# File lib/feedparser/sgml-parser.rb, line 284
# Invokes the end-tag handler named +method+ with no arguments and returns
# its result. +tag+ is not used here.
def handle_endtag(tag, method)
  send(method)
end
# File lib/feedparser/sgml-parser.rb, line 304
# Handles a named entity reference such as "&amp;".
#
# name:: the entity name without the surrounding "&" and ";".
#
# Names present in Entitydefs are replaced by their text and passed to
# handle_data; any other name is passed to unknown_entityref.
def handle_entityref(name)
  unless Entitydefs.include?(name)
    unknown_entityref(name)
    return
  end
  handle_data(Entitydefs[name])
end
# File lib/feedparser/sgml-parser.rb, line 280
# Invokes the start-tag handler named +method+, passing it +attrs+, and
# returns the handler's result. +tag+ is not used here.
def handle_starttag(tag, method, attrs)
  send(method, attrs)
end
# File lib/feedparser/sgml-parser.rb, line 43
# Returns true when an element named +gi+ is currently on the
# open-element stack, false otherwise.
def has_context(gi)
  @stack.include?(gi)
end
# File lib/feedparser/sgml-parser.rb, line 160
# Parses the comment that starts at offset +i+ of @rawdata.
#
# Returns the number of characters the comment occupies (opening "<!--"
# through the closing delimiter), or nil when no closing delimiter is in
# the buffer yet. The text after "<!--" and before the closing delimiter
# is passed to handle_comment.
#
# Raises RuntimeError when the buffer does not hold "<!--" at +i+.
def parse_comment(i)
  rawdata = @rawdata
  unless rawdata[i, 4] == '<!--'
    raise RuntimeError, 'unexpected call to handle_comment'
  end

  close_at = rawdata.index(Commentclose, i)
  return nil unless close_at

  close_len = Regexp.last_match[0].length
  handle_comment(rawdata[i+4..(close_at-1)])
  close_at + close_len - i
end
# File lib/feedparser/sgml-parser.rb, line 212
# Parses the end tag that starts at offset +i+ of @rawdata.
#
# Returns the absolute offset just past the tag, or nil when no '<' or '>'
# follows yet. The tag name (stripped and lower-cased) is passed to
# finish_endtag. A terminating '>' is consumed; a terminating '<' is left
# in place for the next scan.
def parse_endtag(i)
  rawdata = @rawdata
  bracket_at = rawdata.index(Endbracket, i + 1)
  return nil unless bracket_at

  name = rawdata[i+2..bracket_at-1].strip.downcase
  bracket_at += 1 if rawdata[bracket_at] == ?>
  finish_endtag(name)
  bracket_at
end
# File lib/feedparser/sgml-parser.rb, line 271
# Parses the "<!...>" construct that starts at offset +i+ of @rawdata.
#
# Returns the number of characters consumed (including the terminating
# bracket), or nil when no '<' or '>' follows yet. The text between the
# leading '<' and the terminating bracket is passed to handle_special.
def parse_special(i)
  rawdata = @rawdata
  bracket_at = rawdata.index(Endbracket, i+1)
  return nil unless bracket_at

  bracket_len = Regexp.last_match[0].length
  handle_special(rawdata[i+1..(bracket_at-1)])
  bracket_at - i + bracket_len
end
# File lib/feedparser/sgml-parser.rb, line 174
# Parses the start tag that begins at offset +i+ of @rawdata.
#
# Returns the absolute offset just past the tag, or nil when no '<' or '>'
# follows yet (the tag is incomplete). The lower-cased tag name and an
# array of [name, value] attribute pairs are passed to finish_starttag.
# Attribute names are lower-cased; surrounding quotes are removed from
# values; an attribute without "=value" gets the value ''.
#
# "<>" is the SGML shorthand for "the last start tag seen" (@lasttag).
#
# Raises RuntimeError when no tag name can be found after +i+.
def parse_starttag(i)
  rawdata = @rawdata
  j = rawdata.index(Endbracket, i + 1)
  return nil unless j
  attrs = []
  if rawdata[i+1] == ?>
    # SGML shorthand: <> == <last open tag seen>
    k = j
    tag = @lasttag
  else
    match = rawdata.index(Tagfind, i + 1)
    unless match
      raise RuntimeError, 'unexpected call to parse_starttag'
    end
    k = i + 1 + ($&.length)
    tag = $&.downcase
    @lasttag = tag
  end
  while k < j
    attr_at = rawdata.index(Attrfind, k)
    # The search is not anchored, so a hit may lie beyond this tag's closing
    # bracket (e.g. the text after "<br/>"); such a hit is not an attribute.
    break unless attr_at && attr_at < j
    matched_length = $&.length
    attrname, rest, attrvalue = $1, $2, $3
    if !rest
      attrvalue = '' # was: = attrname
    else
      quote = attrvalue[0, 1]
      if attrvalue.length >= 2 && (quote == "'" || quote == '"') &&
         attrvalue[-1, 1] == quote
        attrvalue = attrvalue[1..-2]
      end
    end
    attrs << [attrname.downcase, attrvalue]
    # Continue after the END of the match; any characters skipped before
    # the match must be counted, or the next search restarts inside it.
    k = attr_at + matched_length
  end
  # Consume a closing '>'; a '<' that ended the tag is left for the caller.
  j += 1 if rawdata[j] == ?>
  finish_starttag(tag, attrs)
  return j
end
# File lib/feedparser/sgml-parser.rb, line 288
# In verbose mode, prints a diagnostic for an end tag that has no matching
# open element, followed by the current element stack. Does nothing when
# @verbose is not set.
# NOTE(review): relies on a +stack+ reader method that is not visible in
# this listing - confirm it is defined on the class.
def report_unbalanced(tag)
  return unless @verbose

  print '*** Unbalanced </' + tag + '>', "\n"
  print '*** Stack:', self.stack, "\n"
end
# File lib/feedparser/sgml-parser.rb, line 35
# Returns the parser to its initial state: empty input buffer, empty
# open-element stack, placeholder last tag, and markup recognition on
# (neither @nomoretags nor @literal set).
def reset
  @rawdata = ''
  @stack = []
  @lasttag = '???'
  @nomoretags = @literal = false
end
# File lib/feedparser/sgml-parser.rb, line 47
# Switches markup recognition off by setting both @nomoretags and @literal.
# goahead passes all remaining input to handle_data while @nomoretags is set.
def setnomoretags
  @nomoretags = @literal = true
end