Parent

Class/Module Index [+]

Quicksearch

HTML5::HTMLTokenizer

This class takes care of tokenizing HTML.

Attributes

content_model_flag[RW]
current_token[RW]
stream[R]

Public Class Methods

new(stream, options = {}) click to toggle source

XXX need to fix documentation

# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 26
# Creates a tokenizer for +stream+.
#
# stream  - passed straight to HTMLInputStream.new together with +options+.
# options - Hash. :lowercase_element_name and :lowercase_attr_name are
#           treated as enabled unless they are explicitly set to false.
def initialize(stream, options = {})
  @stream = HTMLInputStream.new(stream, options)

  # Case-folding switches: anything but an explicit false turns them on.
  @lowercase_attr_name    = (options[:lowercase_attr_name]    != false)
  @lowercase_element_name = (options[:lowercase_element_name] != false)

  # Tokens waiting to be handed out, and the token under construction.
  @token_queue   = []
  @current_token = nil

  # Starting point of the state machine.
  @lastFourChars      = []
  @escapeFlag         = false
  @state              = :data_state
  @content_model_flag = :PCDATA
end

Public Instance Methods

after_attribute_name_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 491
# State reached once an attribute name has been followed by whitespace.
#
# Skips further whitespace, switches to the value state on "=", calls
# emit_current_token on ">" (and on EOF, after queueing a parse error),
# hands "/" to process_solidus_in_tag, and treats any other character as
# the first character of a new attribute.
#
# Always returns true.
def after_attribute_name_state
  ch = @stream.char
  case
  when SPACE_CHARACTERS.include?(ch)
    @stream.chars_until(SPACE_CHARACTERS, true)
  when ch == "="
    @state = :before_attribute_value_state
  when ch == ">"
    emit_current_token
  when ch == :EOF
    @token_queue.push({:type => :ParseError, :data => "expected-end-of-tag-but-got-eof"})
    emit_current_token
  when ASCII_LETTERS.include?(ch), ch != "/"
    # Letters and every other non-solidus character start a new attribute.
    @current_token[:data].push([ch, ""])
    @state = :attribute_name_state
  else
    process_solidus_in_tag
    @state = :before_attribute_name_state
  end
  true
end
after_doctype_name_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 762
# State entered after the DOCTYPE name has been read.
#
# Whitespace is skipped, ">" queues the doctype token, and EOF queues it
# flagged as incorrect after an "eof-in-doctype" parse error. Any other
# character is read together with the next five as a keyword: "public" or
# "system" (case-insensitive via ASCII_UPPERCASE/ASCII_LOWERCASE) selects
# the matching identifier state; otherwise the characters are pushed back,
# a parse error is queued and the state becomes :bogus_doctype_state.
#
# Always returns true.
def after_doctype_name_state
  data = @stream.char
  if SPACE_CHARACTERS.include? data
    # Skip whitespace and remain in this state.
  elsif data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @current_token[:correct] = false
    @stream.unget(data)
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @token_queue << @current_token
    @state = :data_state
  else
    # Six characters in total are needed to recognise PUBLIC / SYSTEM.
    char_stack = [data]
    5.times { char_stack << @stream.char }
    keyword = char_stack.join('').tr(ASCII_UPPERCASE, ASCII_LOWERCASE)
    hit_eof = char_stack.include?(:EOF)
    if keyword == "public" and !hit_eof
      @state = :before_doctype_public_identifier_state
    elsif keyword == "system" and !hit_eof
      @state = :before_doctype_system_identifier_state
    else
      @stream.unget(char_stack)
      # The variables travel under the Symbol key :datavars, like the other
      # parse errors in this tokenizer that carry variables.
      @token_queue << {:type => :ParseError, :data => "expected-space-or-right-bracket-in-doctype", :datavars => {"data" => data}}
      @state = :bogus_doctype_state
    end
  end
  return true
end
after_doctype_public_identifier_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 849
# State entered after the closing quote of a DOCTYPE public identifier.
#
# Whitespace is skipped; a quote starts the system identifier (reset to "");
# ">" queues the doctype token; EOF queues an "eof-in-doctype" parse error
# and the token flagged as incorrect. Any other character is reported as
# "unexpected-char-in-doctype" (the same code the before_doctype_* states
# use for stray characters) and moves on to :bogus_doctype_state.
#
# Always returns true.
def after_doctype_public_identifier_state
  data = @stream.char
  if SPACE_CHARACTERS.include?(data)
    # Skip whitespace and remain in this state.
  elsif data == "\""
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_double_quoted_state
  elsif data == "'"
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_single_quoted_state
  elsif data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    # Not EOF: a stray character, so report it as such.
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @state = :bogus_doctype_state
  end
  return true
end
after_doctype_system_identifier_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 929
# State entered after the closing quote of a DOCTYPE system identifier.
#
# Whitespace is skipped; ">" queues the doctype token; EOF queues an
# "eof-in-doctype" parse error and the token flagged as incorrect. Any other
# character is reported as "unexpected-char-in-doctype" and moves on to
# :bogus_doctype_state.
#
# Always returns true.
def after_doctype_system_identifier_state
  data = @stream.char
  if SPACE_CHARACTERS.include?(data)
    # Skip whitespace and remain in this state.
  elsif data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    # Not EOF: a stray character, so report it as such.
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @state = :bogus_doctype_state
  end
  return true
end
attribute_name_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 444
# Accumulates the name of the attribute that is currently last in
# @current_token[:data].
#
# Letters (plus the following run of letters) and unrecognised characters
# are appended to the name. "=", whitespace, "/", ">" and EOF end the name;
# at that point the name is lower-cased when @lowercase_attr_name is set, a
# "duplicate-attribute" parse error is queued at most once if an earlier
# attribute has the same name, and for ">" / EOF emit_current_token is
# called.
#
# Always returns true.
def attribute_name_state
  ch = @stream.char
  name_finished = true
  emit_after = false

  case
  when ch == "="
    @state = :before_attribute_value_state
  when ch == :EOF
    @token_queue.push({:type => :ParseError, :data => "eof-in-attribute-name"})
    @state = :data_state
    emit_after = true
  when ASCII_LETTERS.include?(ch)
    attribute = @current_token[:data][-1]
    attribute[0] += ch + @stream.chars_until(ASCII_LETTERS, true)
    name_finished = false
  when ch == ">"
    # Emission is postponed until the duplicate check below has run.
    emit_after = true
  when SPACE_CHARACTERS.include?(ch)
    @state = :after_attribute_name_state
  when ch == "/"
    process_solidus_in_tag
    @state = :before_attribute_name_state
  else
    attribute = @current_token[:data][-1]
    attribute[0] += ch
    name_finished = false
  end

  if name_finished
    attributes = @current_token[:data]
    if @lowercase_attr_name
      attributes[-1][0] = attributes.last.first.downcase
    end

    # Duplicates are only reported here, not removed.
    finished_name = attributes.last.first
    if attributes[0...-1].any? { |name, _value| finished_name == name }
      @token_queue.push({:type => :ParseError, :data => "duplicate-attribute"})
    end

    emit_current_token if emit_after
  end
  true
end
attribute_value_double_quoted_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 538
# Reads the value of a double-quoted attribute.
#
# A closing quote goes back to :before_attribute_name_state, "&" is handed
# to process_entity_in_attribute, EOF queues a parse error and calls
# emit_current_token, and any other text up to the next quote or ampersand
# is appended to the value of the last attribute.
#
# Always returns true.
def attribute_value_double_quoted_state
  ch = @stream.char
  case ch
  when "\""
    @state = :before_attribute_name_state
  when "&"
    process_entity_in_attribute
  when :EOF
    @token_queue.push({:type => :ParseError, :data => "eof-in-attribute-value-double-quote"})
    emit_current_token
  else
    attribute = @current_token[:data][-1]
    attribute[1] += ch + @stream.chars_until(["\"", "&"])
  end
  true
end
attribute_value_single_quoted_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 553
# Reads the value of a single-quoted attribute.
#
# A closing apostrophe goes back to :before_attribute_name_state, "&" is
# handed to process_entity_in_attribute, EOF queues a parse error and calls
# emit_current_token, and any other text up to the next apostrophe or
# ampersand is appended to the value of the last attribute.
#
# Always returns true.
def attribute_value_single_quoted_state
  ch = @stream.char
  case ch
  when "'"
    @state = :before_attribute_name_state
  when "&"
    process_entity_in_attribute
  when :EOF
    @token_queue.push({:type => :ParseError, :data => "eof-in-attribute-value-single-quote"})
    emit_current_token
  else
    attribute = @current_token[:data][-1]
    attribute[1] += ch + @stream.chars_until(["'", "&"])
  end
  true
end
attribute_value_unquoted_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 569
# Reads the value of an attribute written without quotes.
#
# Whitespace ends the value, "&" is handed to process_entity_in_attribute,
# ">" calls emit_current_token, EOF does the same after queueing a parse
# error, and any other text up to the next "&", ">", "<" or whitespace is
# appended to the value of the last attribute.
#
# Always returns true.
def attribute_value_unquoted_state
  ch = @stream.char
  case
  when SPACE_CHARACTERS.include?(ch)
    @state = :before_attribute_name_state
  when ch == "&"
    process_entity_in_attribute
  when ch == ">"
    emit_current_token
  when ch == :EOF
    @token_queue.push({:type => :ParseError, :data => "eof-in-attribute-value-no-quotes"})
    emit_current_token
  else
    terminators = ["&", ">", "<"] + SPACE_CHARACTERS
    attribute = @current_token[:data][-1]
    attribute[1] += ch + @stream.chars_until(terminators)
  end
  true
end
before_attribute_name_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 423
# State between attributes, before the next attribute name begins.
#
# Skips whitespace, calls emit_current_token on ">" (and on EOF, after
# queueing a parse error), hands "/" to process_solidus_in_tag, and starts
# a new [name, ""] attribute pair for any other character.
#
# Always returns true.
def before_attribute_name_state
  ch = @stream.char
  case
  when SPACE_CHARACTERS.include?(ch)
    @stream.chars_until(SPACE_CHARACTERS, true)
  when ch == :EOF
    @token_queue.push({:type => :ParseError, :data => "expected-attribute-name-but-got-eof"})
    emit_current_token
  when ASCII_LETTERS.include?(ch), ch != ">" && ch != "/"
    # Letters and every other ordinary character start a new attribute.
    @current_token[:data].push([ch, ""])
    @state = :attribute_name_state
  when ch == ">"
    emit_current_token
  else
    # Only "/" is left at this point.
    process_solidus_in_tag
  end
  true
end
before_attribute_value_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 515
# State after the "=" of an attribute, before its value starts.
#
# Skips whitespace, selects the quoted or unquoted value state depending on
# the first character ("&" is pushed back so the unquoted state sees it),
# and calls emit_current_token on ">" or, after a parse error, on EOF.
#
# Always returns true.
def before_attribute_value_state
  ch = @stream.char
  if SPACE_CHARACTERS.include?(ch)
    @stream.chars_until(SPACE_CHARACTERS, true)
    return true
  end

  case ch
  when "\""
    @state = :attribute_value_double_quoted_state
  when "&"
    @state = :attribute_value_unquoted_state
    @stream.unget(ch)
  when "'"
    @state = :attribute_value_single_quoted_state
  when ">"
    emit_current_token
  when :EOF
    @token_queue.push({:type => :ParseError, :data => "expected-attribute-value-but-got-eof"})
    emit_current_token
  else
    attribute = @current_token[:data][-1]
    attribute[1] += ch
    @state = :attribute_value_unquoted_state
  end
  true
end
before_doctype_name_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 723
# State before the name of a DOCTYPE.
#
# Whitespace is skipped. ">" and EOF each queue their own parse error, flag
# the doctype token as incorrect, queue it and return to :data_state. Any
# other character becomes the first character of the doctype name.
#
# Always returns true.
def before_doctype_name_state
  ch = @stream.char
  return true if SPACE_CHARACTERS.include?(ch)

  if ch == ">" || ch == :EOF
    reason = (ch == ">") ? "expected-doctype-name-but-got-right-bracket" : "expected-doctype-name-but-got-eof"
    @token_queue.push({:type => :ParseError, :data => reason})
    @current_token[:correct] = false
    @token_queue.push(@current_token)
    @state = :data_state
  else
    @current_token[:name] = ch
    @state = :doctype_name_state
  end
  true
end
before_doctype_public_identifier_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 791
# State after the PUBLIC keyword, before the public identifier.
#
# Whitespace is skipped. A quote resets :publicId to "" and selects the
# matching quoted state. ">" and EOF each queue a parse error, flag the
# token as incorrect, queue it and return to :data_state. Anything else
# queues "unexpected-char-in-doctype" and goes to :bogus_doctype_state.
#
# Always returns true.
def before_doctype_public_identifier_state
  ch = @stream.char
  return true if SPACE_CHARACTERS.include?(ch)

  case ch
  when "\"", "'"
    @current_token[:publicId] = ""
    @state = (ch == "\"") ? :doctype_public_identifier_double_quoted_state : :doctype_public_identifier_single_quoted_state
  when ">", :EOF
    reason = (ch == ">") ? "unexpected-end-of-doctype" : "eof-in-doctype"
    @token_queue.push({:type => :ParseError, :data => reason})
    @current_token[:correct] = false
    @token_queue.push(@current_token)
    @state = :data_state
  else
    @token_queue.push({:type => :ParseError, :data => "unexpected-char-in-doctype"})
    @state = :bogus_doctype_state
  end
  true
end
before_doctype_system_identifier_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 873
# State after the SYSTEM keyword, before the system identifier.
#
# Whitespace is skipped. A quote resets :systemId to "" and selects the
# matching quoted state. ">" and EOF each queue a parse error, flag the
# token as incorrect, queue it and return to :data_state. Anything else
# queues "unexpected-char-in-doctype" and goes to :bogus_doctype_state.
#
# Always returns true.
def before_doctype_system_identifier_state
  ch = @stream.char
  return true if SPACE_CHARACTERS.include?(ch)

  case ch
  when "\"", "'"
    @current_token[:systemId] = ""
    @state = (ch == "\"") ? :doctype_system_identifier_double_quoted_state : :doctype_system_identifier_single_quoted_state
  when ">", :EOF
    reason = (ch == ">") ? "unexpected-char-in-doctype" : "eof-in-doctype"
    @token_queue.push({:type => :ParseError, :data => reason})
    @current_token[:correct] = false
    @token_queue.push(@current_token)
    @state = :data_state
  else
    @token_queue.push({:type => :ParseError, :data => "unexpected-char-in-doctype"})
    @state = :bogus_doctype_state
  end
  true
end
bogus_comment_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 586
# Turns everything up to the next ">" into a Comment token.
#
# The text returned by @stream.chars_until(">") becomes the comment data;
# one further character is then read and discarded before the tokenizer
# returns to :data_state.
#
# Always returns true.
def bogus_comment_state
  comment_text = @stream.chars_until(">")
  @token_queue.push({:type => :Comment, :data => comment_text})

  # Drop the character that ended the scan.
  @stream.char
  @state = :data_state
  true
end
bogus_doctype_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 947
# Skips the rest of a malformed DOCTYPE.
#
# Every call flags the doctype token as incorrect. ">" queues the token and
# returns to :data_state; EOF does the same after pushing EOF back and
# queueing an "eof-in-doctype" parse error. Other characters are dropped.
#
# Always returns true.
def bogus_doctype_state
  ch = @stream.char
  @current_token[:correct] = false
  case ch
  when ">"
    @token_queue.push(@current_token)
    @state = :data_state
  when :EOF
    @stream.unget(ch)
    @token_queue.push({:type => :ParseError, :data => "eof-in-doctype"}, @current_token)
    @state = :data_state
  end
  true
end
close_tag_open_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 343
# State entered after "</" has been consumed.
#
# In RCDATA/CDATA content the characters that follow are first peeked at
# (read and pushed back): only when they spell the name of @current_token,
# case-insensitively, and are followed by whitespace, ">", "/", "<" or EOF
# does the tokenizer switch back to PCDATA and continue; otherwise "</" is
# queued as character data and the state returns to :data_state.
#
# After that check (or directly, in PCDATA content) the next character
# decides: a letter starts an EndTag token, while EOF, ">" and anything else
# each queue a parse error and pick the follow-up state shown below.
#
# Always returns true.
def close_tag_open_state
  if (@content_model_flag == :RCDATA or @content_model_flag == :CDATA)
    if @current_token
      char_stack = []

      # So far we know that "</" has been consumed. We now need to know
      # whether the next few characters match the name of last emitted
      # start tag which also happens to be the current_token. We also need
      # to have the character directly after the characters that could
      # match the start tag name.
      (@current_token[:name].length + 1).times do
        char_stack.push(@stream.char)
        # Make sure we don't get hit by :EOF
        break if char_stack[-1] == :EOF
      end

      # Since this is just for checking. We put the characters back on
      # the stack.
      @stream.unget(char_stack)
    end

    # char_stack is only populated when @current_token is set; the leading
    # "@current_token and" keeps the comparison from running without it.
    if @current_token and
      @current_token[:name].downcase == 
      char_stack[0...-1].join('').downcase and
      (SPACE_CHARACTERS + [">", "/", "<", :EOF]).include? char_stack[-1]
      # Because the characters are correct we can safely switch to
      # PCDATA mode now. This also means we don't have to do it when
      # emitting the end tag token.
      @content_model_flag = :PCDATA
    else
      @token_queue << {:type => :Characters, :data => "</"}
      @state = :data_state

      # Need to return here since we don't want the rest of the
      # method to be walked through.
      return true
    end
  end

  data = @stream.char
  if data == :EOF
    @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-eof"}
    @token_queue << {:type => :Characters, :data => "</"}
    @state = :data_state
  elsif ASCII_LETTERS.include? data
    # First letter of the end tag name; tag_name_state collects the rest.
    @current_token = {:type => :EndTag, :name => data, :data => []}
    @state = :tag_name_state
  elsif data == ">"
    # "</>" produces no token at all, only the parse error.
    @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-right-bracket"}
    @state = :data_state
  else
    # XXX data can be _'_...
    @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-char", :datavars => {:data => data}}
    # The character is pushed back so bogus_comment_state reads it as part
    # of the comment text.
    @stream.unget(data)
    @state = :bogus_comment_state
  end

  return true
end
comment_end_dash_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 671
# State after a single "-" has been seen inside a comment.
#
# A second "-" moves to :comment_end_state. EOF queues a parse error and
# the comment token, then returns to :data_state. Otherwise the pending
# "-", the current character and the text up to the next "-" are appended
# to the comment, and one more character is read and discarded so that the
# tokenizer stays in this state for the character after it.
#
# Always returns true.
def comment_end_dash_state
  ch = @stream.char
  case ch
  when "-"
    @state = :comment_end_state
  when :EOF
    @token_queue.push({:type => :ParseError, :data => "eof-in-comment-end-dash"})
    @token_queue.push(@current_token)
    @state = :data_state
  else
    @current_token[:data] += "-" + ch + @stream.chars_until("-")
    # Swallow the character that stopped the scan.
    @stream.char
  end
  true
end
comment_end_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 690
# State after "--" has been seen inside a comment.
#
# ">" queues the comment token and returns to :data_state. A further "-"
# queues a parse error and is appended to the comment. EOF queues a parse
# error and the token. Any other character queues a parse error, appends
# "--" plus the character and goes back to :comment_state.
#
# Always returns true.
def comment_end_state
  ch = @stream.char
  case ch
  when ">"
    @token_queue.push(@current_token)
    @state = :data_state
  when "-"
    @token_queue.push({:type => :ParseError, :data => "unexpected-dash-after-double-dash-in-comment"})
    @current_token[:data] += ch
  when :EOF
    @token_queue.push({:type => :ParseError, :data => "eof-in-comment-double-dash"})
    @token_queue.push(@current_token)
    @state = :data_state
  else
    @token_queue.push({:type => :ParseError, :data => "unexpected-char-in-comment"})
    @current_token[:data] += "--" + ch
    @state = :comment_state
  end
  true
end
comment_start_dash_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 638
# State after "<!--" has been followed by a single "-".
#
# A second "-" moves to :comment_end_state. ">" and EOF each queue a parse
# error and the comment token, then return to :data_state. Otherwise the
# pending "-", the current character and the text up to the next "-" are
# appended to the comment and the state becomes :comment_state.
#
# Always returns true.
def comment_start_dash_state
  ch = @stream.char
  case ch
  when "-"
    @state = :comment_end_state
  when ">", :EOF
    reason = (ch == ">") ? "incorrect-comment" : "eof-in-comment"
    @token_queue.push({:type => :ParseError, :data => reason})
    @token_queue.push(@current_token)
    @state = :data_state
  else
    @current_token[:data] += '-' + ch + @stream.chars_until("-")
    @state = :comment_state
  end
  true
end
comment_start_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 619
# First state inside a comment, directly after "<!--".
#
# "-" moves to :comment_start_dash_state. ">" and EOF each queue a parse
# error and the comment token, then return to :data_state. Otherwise the
# character and the text up to the next "-" are appended to the comment and
# the state becomes :comment_state.
#
# Always returns true.
def comment_start_state
  ch = @stream.char
  case ch
  when "-"
    @state = :comment_start_dash_state
  when ">", :EOF
    reason = (ch == ">") ? "incorrect-comment" : "eof-in-comment"
    @token_queue.push({:type => :ParseError, :data => reason})
    @token_queue.push(@current_token)
    @state = :data_state
  else
    @current_token[:data] += ch + @stream.chars_until("-")
    @state = :comment_state
  end
  true
end
comment_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 657
# Main state while reading comment text.
#
# "-" moves to :comment_end_dash_state. EOF queues a parse error and the
# comment token, then returns to :data_state. Otherwise the character and
# the text up to the next "-" are appended to the comment.
#
# Always returns true.
def comment_state
  ch = @stream.char
  case ch
  when "-"
    @state = :comment_end_dash_state
  when :EOF
    @token_queue.push({:type => :ParseError, :data => "eof-in-comment"})
    @token_queue.push(@current_token)
    @state = :data_state
  else
    @current_token[:data] += ch + @stream.chars_until("-")
  end
  true
end
consume_entity(from_attribute=false) click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 137
# Tries to consume a character reference; the caller has already consumed
# the "&".
#
# from_attribute - when true, a named entity that lacks its trailing ";"
#                  and is directly followed by a letter or digit is not
#                  expanded: all characters are pushed back and "&" is
#                  returned.
#
# Returns the replacement String, or nil when nothing could be consumed (in
# which case every character read is pushed back onto @stream). Parse errors
# are appended to @token_queue. Numeric references are delegated to
# consume_number_entity.
def consume_entity(from_attribute=false)
  char = nil
  char_stack = [@stream.char]
  if SPACE_CHARACTERS.include?(char_stack[0]) or [:EOF, '<', '&'].include?(char_stack[0])
    # Not a reference at all; leave the character for the caller's state.
    @stream.unget(char_stack)
  elsif char_stack[0] == '#'
    # We might have a number entity here.
    char_stack += [@stream.char, @stream.char]
    if char_stack[0 .. 1].include? :EOF
      # If we reach the end of the file put everything up to :EOF
      # back in the queue
      char_stack = char_stack[0...char_stack.index(:EOF)]
      @stream.unget(char_stack)
      @token_queue << {:type => :ParseError, :data => "expected-numeric-entity-but-got-eof"}
    else
      if char_stack[1].downcase == "x" and HEX_DIGITS.include? char_stack[2]
        # Hexadecimal entity detected.
        @stream.unget(char_stack[2])
        char = consume_number_entity(true)
      elsif DIGITS.include? char_stack[1]
        # Decimal entity detected.
        @stream.unget(char_stack[1..-1])
        char = consume_number_entity(false)
      else
        # No number entity detected.
        @stream.unget(char_stack)
        @token_queue << {:type => :ParseError, :data => "expected-numeric-entity"}
      end
    end
  else
    # At this point we might have a named entity. Candidate names are the
    # keys of ENTITIES.
    #
    # Consume characters and compare them to a prefix of the entity names
    # until the prefix no longer matches any of them.
    filteredEntityList = ENTITIES.keys
    # name[0, 1] / name[-1, 1] return one-character Strings on every Ruby
    # version, unlike name[0] / name[-1].
    filteredEntityList.reject! {|e| e[0, 1] != char_stack[0]}
    entityName = nil

    # Try to find the longest entity the string will match to take care
    # of &noti for instance.
    while char_stack.last != :EOF
      name = char_stack.join('')
      if filteredEntityList.any? {|e| e[0...name.length] == name}
        filteredEntityList.reject! {|e| e[0...name.length] != name}
        char_stack.push(@stream.char)
      else
        break
      end

      if ENTITIES.include? name
        entityName = name
        break if entityName[-1, 1] == ';'
      end
    end

    if entityName != nil
      char = ENTITIES[entityName]
      has_semicolon = (entityName[-1, 1] == ";")

      unless has_semicolon
        @token_queue << {:type => :ParseError, :data => "named-entity-without-semicolon"}
      end

      # The element right after the entity name can be :EOF; only a real
      # character is tested against the letter and digit sets.
      following = char_stack[entityName.length]
      if !has_semicolon and from_attribute and following.is_a?(String) and
         (ASCII_LETTERS.include?(following) or DIGITS.include?(following))
        @stream.unget(char_stack)
        char = '&'
      else
        # Put back whatever was read beyond the entity name.
        @stream.unget(char_stack[entityName.length..-1])
      end
    else
      @token_queue << {:type => :ParseError, :data => "expected-named-entity"}
      @stream.unget(char_stack)
    end
  end
  return char
end
consume_number_entity(isHex) click to toggle source

This function returns either U+FFFD or the character based on the decimal or hexadecimal representation. It also discards ";" if present. If it is not present, a "numeric-entity-without-semicolon" ParseError token is appended to @token_queue.

# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 84
# Consumes the digits of a numeric character reference and returns the
# character they denote.
#
# isHex - true to read hexadecimal digits (HEX_DIGITS, radix 16), otherwise
#         decimal digits (DIGITS, radix 10).
#
# Code point 13 is replaced by 10, and 128..159 are looked up in
# ENTITIES_WINDOWS1252; both cases queue a parse error. Values outside
# 1..1114111 or inside 55296..57343 yield U+FFFD plus a parse error. A
# terminating ";" is discarded; any other terminator is pushed back and a
# parse error is queued.
#
# Returns the resulting character as a String.
def consume_number_entity(isHex)
  radix, allowed = isHex ? [16, HEX_DIGITS] : [10, DIGITS]

  # Collect the run of acceptable digits; c ends up holding the first
  # character that is not one of them.
  digits = []
  c = @stream.char
  while allowed.include?(c) and c != :EOF
    digits << c
    c = @stream.char
  end

  code_point = digits.join('').to_i(radix)

  if code_point == 13
    @token_queue.push({:type => :ParseError, :data => "incorrect-cr-newline-entity"})
    code_point = 10
  elsif (128..159).include?(code_point)
    @token_queue.push({:type => :ParseError, :data => "illegal-windows-1252-entity"})
    code_point = ENTITIES_WINDOWS1252[code_point - 128]
  end

  in_unicode_range = 0 < code_point && code_point <= 1114111
  in_surrogate_gap = 55296 <= code_point && code_point <= 57343
  if in_unicode_range && !in_surrogate_gap
    char = [code_point].pack('U')
  else
    char = [0xFFFD].pack('U')
    @token_queue.push({:type => :ParseError, :data => "cant-convert-numeric-entity", :datavars => {"charAsInt" => code_point}})
  end

  # The terminator only belongs to the reference when it is a ";".
  unless c == ";"
    @token_queue.push({:type => :ParseError, :data => "numeric-entity-without-semicolon"})
    @stream.unget(c)
  end

  char
end
data_state() click to toggle source

XXX AT Perhaps we should have Hixie run some evaluation on billions of documents to figure out what the order of the various if and elsif statements should be.

# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 249
# The default tokenizer state: reads one character and decides what it
# starts.
#
# In CDATA/RCDATA content the last four characters read by this method are
# remembered in @lastFourChars so that "<!--" can set, and "-->" can clear,
# @escapeFlag. "&" (PCDATA/RCDATA, not escaped) moves to
# :entity_data_state and "<" (not escaped) to :tag_open_state. Whitespace
# is queued as a SpaceCharacters token; everything else is queued as a
# Characters token extending up to the next "&", "<", ">" or "-".
#
# Returns false at EOF, which ends tokenization, and true otherwise.
def data_state
  data = @stream.char

  if @content_model_flag == :CDATA or @content_model_flag == :RCDATA
    @lastFourChars << data
    @lastFourChars.shift if @lastFourChars.length > 4
  end

  if data == "&" and [:PCDATA,:RCDATA].include?(@content_model_flag) and !@escapeFlag
    @state = :entity_data_state
  elsif data == "-" && [:CDATA, :RCDATA].include?(@content_model_flag) && !@escapeFlag && @lastFourChars.join('') == "<!--"
    # The "-" just read completed "<!--".
    @escapeFlag = true
    @token_queue << {:type => :Characters, :data => data}
  elsif data == "<" and !@escapeFlag and
    [:PCDATA,:CDATA,:RCDATA].include?(@content_model_flag)
    @state = :tag_open_state
  elsif data == ">" and @escapeFlag and
    [:CDATA,:RCDATA].include?(@content_model_flag) and
    @lastFourChars[1..-1].join('') == "-->"
    # The ">" just read completed "-->".
    @escapeFlag = false
    @token_queue << {:type => :Characters, :data => data}

  elsif data == :EOF
    # Tokenization ends.
    return false

  elsif SPACE_CHARACTERS.include? data
    # Directly after emitting a token you switch back to the "data
    # state". At that point SPACE_CHARACTERS are important so they are
    # emitted separately.
    # XXX need to check if we don't need a special "spaces" flag on
    # characters.
    @token_queue << {:type => :SpaceCharacters, :data => data + @stream.chars_until(SPACE_CHARACTERS, true)}
  else
    # Stop in front of every character the branches above react to.
    @token_queue << {:type => :Characters, :data => data + @stream.chars_until(%w[& < > -])}
  end
  return true
end
doctype_name_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 743
# Accumulates the DOCTYPE name one character at a time.
#
# Whitespace moves to :after_doctype_name_state, ">" queues the token and
# returns to :data_state, EOF does the same after a parse error and with
# the token flagged as incorrect, and any other character is appended to
# the name.
#
# Always returns true.
def doctype_name_state
  ch = @stream.char
  case
  when SPACE_CHARACTERS.include?(ch)
    @state = :after_doctype_name_state
  when ch == ">"
    @token_queue.push(@current_token)
    @state = :data_state
  when ch == :EOF
    @token_queue.push({:type => :ParseError, :data => "eof-in-doctype-name"})
    @current_token[:correct] = false
    @token_queue.push(@current_token)
    @state = :data_state
  else
    @current_token[:name] += ch
  end
  true
end
doctype_public_identifier_double_quoted_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 819
# Reads a double-quoted DOCTYPE public identifier.
#
# The closing quote moves to :after_doctype_public_identifier_state; EOF
# queues a parse error and the token flagged as incorrect, then returns to
# :data_state; any other character is appended to :publicId.
#
# Always returns true.
def doctype_public_identifier_double_quoted_state
  ch = @stream.char
  case ch
  when "\""
    @state = :after_doctype_public_identifier_state
  when :EOF
    @token_queue.push({:type => :ParseError, :data => "eof-in-doctype"})
    @current_token[:correct] = false
    @token_queue.push(@current_token)
    @state = :data_state
  else
    @current_token[:publicId] += ch
  end
  true
end
doctype_public_identifier_single_quoted_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 834
# Reads a single-quoted DOCTYPE public identifier.
#
# The closing apostrophe moves to :after_doctype_public_identifier_state;
# EOF queues a parse error and the token flagged as incorrect, then returns
# to :data_state; any other character is appended to :publicId.
#
# Always returns true.
def doctype_public_identifier_single_quoted_state
  ch = @stream.char
  case ch
  when "'"
    @state = :after_doctype_public_identifier_state
  when :EOF
    @token_queue.push({:type => :ParseError, :data => "eof-in-doctype"})
    @current_token[:correct] = false
    @token_queue.push(@current_token)
    @state = :data_state
  else
    @current_token[:publicId] += ch
  end
  true
end
doctype_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 711
# State directly after the DOCTYPE keyword.
#
# Whitespace is consumed. Any other character queues a
# "need-space-after-doctype" parse error and is pushed back. Either way the
# next state is :before_doctype_name_state.
#
# Always returns true.
def doctype_state
  ch = @stream.char
  unless SPACE_CHARACTERS.include?(ch)
    @token_queue.push({:type => :ParseError, :data => "need-space-after-doctype"})
    @stream.unget(ch)
  end
  @state = :before_doctype_name_state
  true
end
doctype_system_identifier_double_quoted_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 899
# Reads a double-quoted DOCTYPE system identifier.
#
# The closing quote moves to :after_doctype_system_identifier_state; EOF
# queues a parse error and the token flagged as incorrect, then returns to
# :data_state; any other character is appended to :systemId.
#
# Always returns true.
def doctype_system_identifier_double_quoted_state
  ch = @stream.char
  case ch
  when "\""
    @state = :after_doctype_system_identifier_state
  when :EOF
    @token_queue.push({:type => :ParseError, :data => "eof-in-doctype"})
    @current_token[:correct] = false
    @token_queue.push(@current_token)
    @state = :data_state
  else
    @current_token[:systemId] += ch
  end
  true
end
doctype_system_identifier_single_quoted_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 914
# Reads a single-quoted DOCTYPE system identifier.
#
# The closing apostrophe moves to :after_doctype_system_identifier_state;
# EOF queues a parse error and the token flagged as incorrect, then returns
# to :data_state; any other character is appended to :systemId.
#
# Always returns true.
def doctype_system_identifier_single_quoted_state
  ch = @stream.char
  case ch
  when "'"
    @state = :after_doctype_system_identifier_state
  when :EOF
    @token_queue.push({:type => :ParseError, :data => "eof-in-doctype"})
    @current_token[:correct] = false
    @token_queue.push(@current_token)
    @state = :data_state
  else
    @current_token[:systemId] += ch
  end
  true
end
each() click to toggle source

This is where the magic happens.

We do our usual processing through the states, and when we have a token to return we yield it, which pauses processing until the next token is requested.

# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 49
# Runs the state machine and yields every token it produces.
#
# The method named by @state is invoked repeatedly for as long as it
# returns a true value. After each step, pending @stream.errors are yielded
# as ParseError tokens, followed by everything queued in @token_queue.
def each
  @token_queue = []
  while send(@state)
    # Stream-level errors are reported before the tokens of this step.
    until @stream.errors.empty?
      yield :type => :ParseError, :data => @stream.errors.shift
    end
    until @token_queue.empty?
      yield @token_queue.shift
    end
  end
end
emit_current_token() click to toggle source

This method is a generic handler for emitting the tags. It also sets the state to "data" because that's what's needed after a token has been emitted.

# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 231
# Queues @current_token when it is a StartTag, EndTag or EmptyTag and
# switches back to :data_state; tokens of any other type are ignored.
#
# The tag name is lower-cased first when @lowercase_element_name is set.
def emit_current_token
  token = @current_token
  return unless [:StartTag, :EndTag, :EmptyTag].include?(token[:type])

  token[:name] = token[:name].downcase if @lowercase_element_name
  @token_queue.push(token)
  @state = :data_state
end
entity_data_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 288
# Handles an "&" seen in character data.
#
# Queues a Characters token holding whatever consume_entity returns, or a
# literal "&" when it returns nothing, then goes back to :data_state.
#
# Always returns true.
def entity_data_state
  text = consume_entity() || "&"
  @token_queue.push({:type => :Characters, :data => text})
  @state = :data_state
  true
end
markup_declaration_open_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 599
# State entered after "<!" has been consumed.
#
# "--" starts a Comment token. Otherwise five more characters are read; if
# the seven characters spell DOCTYPE (any case) and contain no EOF, a
# Doctype token is started. Anything else queues a parse error, pushes all
# the characters back and moves to :bogus_comment_state.
#
# Always returns true.
def markup_declaration_open_state
  char_stack = Array.new(2) { @stream.char }
  if char_stack == ["-", "-"]
    @current_token = {:type => :Comment, :data => ""}
    @state = :comment_start_state
    return true
  end

  5.times { char_stack.push(@stream.char) }
  is_doctype = !char_stack.include?(:EOF) && char_stack.join("").upcase == "DOCTYPE"
  if is_doctype
    @current_token = {:type => :Doctype, :name => "", :publicId => nil, :systemId => nil, :correct => true}
    @state = :doctype_state
  else
    @token_queue.push({:type => :ParseError, :data => "expected-dashes-or-doctype"})
    @stream.unget(char_stack)
    @state = :bogus_comment_state
  end
  true
end
process_entity_in_attribute() click to toggle source

This method replaces the need for "entityInAttributeValueState".

# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 219
# Resolves a character reference inside an attribute value and appends the
# result to the value of the last attribute of @current_token. A literal
# "&" is appended when consume_entity returns nothing.
#
# consume_entity is called with from_attribute = true so that its
# attribute-specific handling of named entities applies here.
def process_entity_in_attribute
  entity = consume_entity(true)
  @current_token[:data][-1][1] += (entity || "&")
end
process_solidus_in_tag() click to toggle source

If the next character is a '>', convert the current_token into an EmptyTag

# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 64
# Handles a "/" found inside a tag.
#
# Peeks at the next character: when the current token is a StartTag and
# that character is ">", the token becomes an EmptyTag; in every other case
# an "incorrectly-placed-solidus" parse error is queued. The peeked
# character is always pushed back onto @stream.
def process_solidus_in_tag
  following = @stream.char
  self_closing = (@current_token[:type] == :StartTag && following == ">")

  if self_closing
    @current_token[:type] = :EmptyTag
  else
    @token_queue.push({:type => :ParseError, :data => "incorrectly-placed-solidus"})
  end

  # Only a look-ahead: hand the character back to the stream.
  @stream.unget(following)
end
tag_name_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 403
# Accumulates the name of the tag held in @current_token.
#
# Whitespace moves to :before_attribute_name_state, ">" calls
# emit_current_token, EOF does the same after a parse error, "/" is handed
# to process_solidus_in_tag, letters are appended together with the run of
# letters that follows, and any other character is appended on its own.
#
# Always returns true.
def tag_name_state
  ch = @stream.char
  case
  when SPACE_CHARACTERS.include?(ch)
    @state = :before_attribute_name_state
  when ch == :EOF
    @token_queue.push({:type => :ParseError, :data => "eof-in-tag-name"})
    emit_current_token
  when ASCII_LETTERS.include?(ch)
    # Take the whole run of letters in one go.
    @current_token[:name] += ch + @stream.chars_until(ASCII_LETTERS, true)
  when ch == ">"
    emit_current_token
  when ch == "/"
    process_solidus_in_tag
    @state = :before_attribute_name_state
  else
    @current_token[:name] += ch
  end
  true
end
tag_open_state() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 299
# State entered after "<" has been consumed.
#
# Outside PCDATA content only "/" is meaningful (it moves to
# :close_tag_open_state); anything else queues "<" as character data,
# pushes the character back and returns to :data_state.
#
# In PCDATA content "!" and "/" select the markup-declaration and
# close-tag states, a letter starts a StartTag token, and ">", "?" and all
# remaining input (EOF included) each queue a parse error and continue as
# shown below.
#
# Always returns true.
def tag_open_state
  ch = @stream.char

  if @content_model_flag != :PCDATA
    if ch == "/"
      @state = :close_tag_open_state
    else
      @token_queue.push({:type => :Characters, :data => "<"})
      @stream.unget(ch)
      @state = :data_state
    end
    return true
  end

  case
  when ch == "!"
    @state = :markup_declaration_open_state
  when ch == "/"
    @state = :close_tag_open_state
  when ch != :EOF && ASCII_LETTERS.include?(ch)
    @current_token = {:type => :StartTag, :name => ch, :data => []}
    @state = :tag_name_state
  when ch == ">"
    # "<>" is passed through as text.
    @token_queue.push({:type => :ParseError, :data => "expected-tag-name-but-got-right-bracket"})
    @token_queue.push({:type => :Characters, :data => "<>"})
    @state = :data_state
  when ch == "?"
    # The "?" is pushed back so it ends up in the bogus comment's text.
    @token_queue.push({:type => :ParseError, :data => "expected-tag-name-but-got-question-mark"})
    @stream.unget(ch)
    @state = :bogus_comment_state
  else
    @token_queue.push({:type => :ParseError, :data => "expected-tag-name"})
    @token_queue.push({:type => :Characters, :data => "<"})
    @stream.unget(ch)
    @state = :data_state
  end
  true
end

[Validate]

Generated with the Darkfish Rdoc Generator 2.