Parent

Class/Module Index [+]

Quicksearch

HTML5::HTMLParser

HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML.

Attributes

errors[R]
first_start_tag[RW]
inner_html[RW]
insert_from_table[RW]
last_phase[RW]
phase[RW]
phases[R]
tokenizer[R]
tree[R]

Public Class Methods

new(options = {}) click to toggle source

:strict - raise an exception when a parse error is encountered. :tree - a treebuilder class controlling the type of tree that will be returned. Built-in treebuilders can be accessed through HTML5::TreeBuilders.

# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 41
# Builds a parser and instantiates one phase object per entry in @@phases.
#
# options - Hash of settings; every key/value pair is copied verbatim into
#           an instance variable of the same name (e.g. :strict, :tree,
#           :tokenizer, :lowercase_attr_name, :lowercase_element_name).
def initialize(options = {})
  @strict = false
  @errors = []

  @tokenizer = HTMLTokenizer
  @tree = TreeBuilders::REXML::TreeBuilder

  options.each {|name, value| instance_variable_set("@#{name}", value) }

  # Default the lowercase flags to nil only when the caller did not supply
  # them. instance_variable_defined? is used because instance_variables
  # returns Symbols on Ruby 1.9+, so a check against a String name never
  # matched and a caller-supplied value was always overwritten with nil.
  @lowercase_attr_name    = nil unless instance_variable_defined?(:@lowercase_attr_name)
  @lowercase_element_name = nil unless instance_variable_defined?(:@lowercase_element_name)

  # @tree holds a treebuilder class up to this point; swap in an instance.
  @tree = @tree.new

  # Map each phase name to its phase object: the class name is the phase
  # name with its first character upcased plus "Phase" (inBody -> InBodyPhase).
  @phases = @@phases.inject({}) do |phases, phase_name|
    phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase'
    phases[phase_name.to_sym] = HTML5.const_get(phase_class_name).new(self, @tree)
    phases
  end
end
parse(stream, options = {}) click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 23
# Convenience constructor: builds a parser from options and parses stream
# as a full document.
#
# stream  - passed through to the instance method parse
# options - Hash of parser options; :encoding is extracted and handed to
#           parse, the remaining entries go to new
#
# Returns the result of the instance method parse.
def self.parse(stream, options = {})
  # Work on a copy so the caller's hash is not modified (and may be frozen).
  options = options.dup
  encoding = options.delete(:encoding)
  new(options).parse(stream, encoding)
end
parse_fragment(stream, options = {}) click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 28
# Convenience constructor: builds a parser from options and parses stream
# as a fragment.
#
# stream  - passed through to the instance method parse_fragment
# options - Hash of parser options; :container (defaults to 'div' when
#           missing or nil) and :encoding are extracted, the remaining
#           entries go to new
#
# Returns the result of the instance method parse_fragment.
def self.parse_fragment(stream, options = {})
  # Work on a copy so the caller's hash is not modified (and may be frozen).
  options = options.dup
  container = options.delete(:container) || 'div'
  encoding = options.delete(:encoding)
  new(options).parse_fragment(stream, container, encoding)
end

Public Instance Methods

_(string) click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 245
# Returns the given string unchanged.
# NOTE(review): presumably a placeholder hook for message translation -- confirm.
def _(string)
  string
end
_parse(stream, inner_html, encoding, container = 'div') click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 61
# Shared driver behind parse and parse_fragment: resets state, builds a
# tokenizer for stream, picks the starting phase and feeds every token to
# the current phase.
#
# stream     - input handed to the tokenizer
# inner_html - truthy when parsing a fragment
# encoding   - encoding handed to the tokenizer
# container  - name of the fragment's context element (fragment mode only)
#
# Returns the result of the current phase's process_eof.
def _parse(stream, inner_html, encoding, container = 'div')
  @tree.reset
  @first_start_tag = false
  @errors = []

  # After a previous run @tokenizer is an instance; go back to its class so
  # a fresh tokenizer can be created for this stream.
  @tokenizer = @tokenizer.class unless Class === @tokenizer
  @tokenizer = @tokenizer.new(
    stream,
    :encoding => encoding,
    :parseMeta => !inner_html,
    :lowercase_attr_name => @lowercase_attr_name,
    :lowercase_element_name => @lowercase_element_name
  )

  if inner_html
    @inner_html = container.downcase

    # The context element decides how the tokenizer treats the content;
    # the flag is assigned explicitly even for the PCDATA fallback.
    @tokenizer.content_model_flag =
      case @inner_html
      when 'title', 'textarea'
        :RCDATA
      when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'
        :CDATA
      when 'plaintext'
        :PLAINTEXT
      else
        :PCDATA
      end

    @phase = @phases[:rootElement]
    @phase.insert_html_element
    reset_insertion_mode
  else
    @inner_html = false
    @phase = @phases[:initial]
  end

  # We only seem to have InBodyPhase testcases where the following is
  # relevant ... need others too
  @last_phase = nil

  # XXX This is temporary for the moment so there isn't any other
  # changes needed for the parser to work with the iterable tokenizer
  @tokenizer.each do |raw_token|
    token = normalize_token(raw_token)
    kind = token[:type]

    # Phase handlers are named after the token type, e.g. processStartTag.
    handler = 'process%s' % kind

    case kind
    when :StartTag
      @phase.send handler, token[:name], token[:data]
    when :EndTag
      @phase.send handler, token[:name]
    when :Doctype
      @phase.send handler, token[:name], token[:publicId],
        token[:systemId], token[:correct]
    when :Characters, :SpaceCharacters, :Comment
      @phase.send handler, token[:data]
    else
      # Any other token type is reported as a parse error.
      parse_error(token[:data], token[:datavars])
    end
  end

  # When the loop finishes it's EOF
  @phase.process_eof
end
normalize_token(token) click to toggle source

HTML5 specific normalizations to the token stream

# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 157
# HTML5 specific normalizations to the token stream.
#
# token - Hash with at least :type, plus :name and :data for tag tokens.
#         The hash is modified in place.
#
# Returns the same token hash.
def normalize_token(token)
  if token[:type] == :EmptyTag
    # A trailing solidus (/) is only acceptable on void elements. The name
    # is lowercased for the comparison because it is not normalized until
    # the StartTag branch below; VOID_ELEMENTS is assumed to hold lowercase
    # names -- TODO confirm.
    unless VOID_ELEMENTS.include?(token[:name].downcase)
      parse_error("incorrectly-placed-solidus")
    end

    # From here on an empty tag is handled like any other start tag.
    token[:type] = :StartTag
  end

  if token[:type] == :StartTag
    token[:name] = token[:name].downcase

    # Convert the attribute pairs into a Hash with lowercased names. The
    # list is reversed first so that, for duplicates, the first occurrence
    # wins: [["x", "y"], ["x", "z"]] becomes {"x" => "y"}. Hash[pairs] is
    # used rather than Hash[*pairs.flatten], which would also flatten any
    # nested value and splat the whole list into an argument list.
    unless token[:data].empty?
      pairs = token[:data].reverse.map {|attr, value| [attr.downcase, value] }
      token[:data] = Hash[pairs]
    end

  elsif token[:type] == :EndTag
    parse_error("attributes-in-end-tag") unless token[:data].empty?
    token[:name] = token[:name].downcase
  end

  token
end
parse(stream, encoding=nil) click to toggle source

Parse an HTML document into a well-formed tree

stream - a filelike object or string containing the HTML to be parsed

The optional encoding parameter must be a string that indicates the encoding. If specified, that encoding will be used, regardless of any BOM or later declaration (such as in a meta element)

# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 129
# Parses stream as a complete document.
#
# stream   - a filelike object or string containing the HTML to be parsed
# encoding - optional encoding, handed on to _parse
#
# Returns the result of the tree builder's get_document.
def parse(stream, encoding=nil)
  fragment_mode = false
  _parse(stream, fragment_mode, encoding)
  @tree.get_document
end
parse_error(code = 'XXX-undefined-error', data = {}) click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 150
# Records a parse error as [position, code, data] in @errors, where position
# is read from the tokenizer's stream, and raises when @strict is set.
#
# code - error code string
# data - extra data stored alongside the code
#
# Raises ParseError if @strict is truthy (the error is recorded first).
def parse_error(code = 'XXX-undefined-error', data = {})
  # XXX The idea is to make data mandatory.
  position = @tokenizer.stream.position
  @errors << [position, code, data]
  raise ParseError if @strict
end
parse_fragment(stream, container='div', encoding=nil) click to toggle source

container - name of the element whose inner_html property is being set; if nil, defaults to 'div'

stream - a filelike object or string containing the HTML to be parsed

The optional encoding parameter must be a string that indicates the encoding. If specified, that encoding will be used, regardless of any BOM or later declaration (such as in a meta element)

# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 145
# Parses stream as a fragment.
#
# stream    - a filelike object or string containing the HTML to be parsed
# container - name of the element the fragment is parsed inside; nil falls
#             back to 'div'
# encoding  - optional encoding, handed on to _parse
#
# Returns the result of the tree builder's get_fragment.
def parse_fragment(stream, container='div', encoding=nil)
  # An explicit nil would otherwise reach container.downcase in _parse.
  _parse(stream, true, encoding, container || 'div')
  @tree.get_fragment
end
reset_insertion_mode() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 207
# Chooses @phase by walking the stack of open elements from the most
# recently opened one back to the first, stopping at the first element
# that determines a phase.
#
# The name of this method is mostly historical. (It's also used in the
# specification.)
def reset_insertion_mode
  last = false

  @tree.open_elements.reverse.each do |element|
    tag = element.name

    if element == @tree.open_elements.first
      last = true
      # XXX assert @inner_html
      # For the first element on the stack the fragment's container name
      # is used instead, unless the element is a table cell.
      tag = @inner_html unless ['td', 'th'].include?(tag)
    end

    # XXX 'select', 'colgroup', 'head' and 'frameset' should only show up
    # here in the inner_html case (assert @inner_html).

    if @@new_modes.has_key?(tag)
      @phase = @phases[@@new_modes[tag]]
      break
    end

    if tag == 'html'
      # No head element yet means we are still before the head.
      @phase = @phases[@tree.head_pointer.nil? ? :beforeHead : :afterHead]
      break
    end

    if last
      @phase = @phases[:inBody]
      break
    end
  end
end

[Validate]

Generated with the Darkfish Rdoc Generator 2.