Parent

Class/Module Index [+]

Quicksearch

HTML5::EncodingParser

Mini parser for detecting character encoding from meta elements

Public Class Methods

new(data) click to toggle source

data - the data to work on for encoding detection (converted to a string with to_s)

# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 412
# Set up the parser over the supplied data.
#
# data - the data to scan for an encoding declaration; it is converted
#        with to_s and wrapped in an EncodingBytes.
#
# No encoding is recorded until one is detected.
def initialize(data)
  @encoding = nil
  @data = EncodingBytes.new(data.to_s)
end

Public Instance Methods

get_attribute() click to toggle source

Return a [name, value] pair for the next attribute in the stream if one is found, or nil otherwise

# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 514
# Return a [name, value] pair for the next attribute in the stream, or nil
# when the next significant byte is '<' or '>' (no attribute found).
#
# ASCII uppercase bytes in both the name and the value are lowercased.
# An attribute that has no value is returned with '' as its value.
def get_attribute
  @data.skip(SPACE_CHARACTERS + ['/'])

  if @data.current_byte == '<'
    # Step back one byte; presumably so the outer scan reprocesses the '<'
    @data.position -= 1
    return nil
  elsif @data.current_byte == '>'
    return nil
  end

  attr_name = []
  attr_value = []
  space_found = false
  #Step 5 attribute name
  while true
    # An empty Array is truthy in Ruby, so the name has to be tested with
    # empty?: an '=' seen before any name byte is part of the name itself.
    if @data.current_byte == '=' && !attr_name.empty?
      break
    elsif SPACE_CHARACTERS.include?(@data.current_byte)
      space_found = true
      break
    elsif ['/', '<', '>'].include?(@data.current_byte)
      # Tag ended (or a new one started) before any value was given
      return [attr_name.join(''), '']
    elsif ASCII_UPPERCASE.include?(@data.current_byte)
      attr_name.push(@data.current_byte.downcase)
    else
      attr_name.push(@data.current_byte)
    end
    #Step 6
    @data.position += 1
  end
  #Step 7
  if space_found
    @data.skip
    #Step 8
    unless @data.current_byte == '='
      # The name was followed by something other than '=': it has no value.
      # Step back so that byte is seen again by the next call.
      @data.position -= 1
      return [attr_name.join(''), '']
    end
  end
  #XXX need to advance position in both spaces and value case
  #Step 9 (move past the '=')
  @data.position += 1
  #Step 10
  @data.skip
  #Step 11
  if ["'", '"'].include?(@data.current_byte)
    #11.1 quoted value: collect bytes up to the matching quote
    quote_char = @data.current_byte
    while true
      @data.position += 1
      #11.3
      if @data.current_byte == quote_char
        @data.position += 1
        return [attr_name.join(''), attr_value.join('')]
      #11.4
      elsif ASCII_UPPERCASE.include?(@data.current_byte)
        attr_value.push(@data.current_byte.downcase)
      #11.5
      else
        attr_value.push(@data.current_byte)
      end
    end
  elsif ['>', '<'].include?(@data.current_byte)
    return [attr_name.join(''), '']
  elsif ASCII_UPPERCASE.include?(@data.current_byte)
    attr_value.push(@data.current_byte.downcase)
  else
    attr_value.push(@data.current_byte)
  end
  # Unquoted value: runs until a space character, '>' or '<'
  while true
    @data.position += 1
    if (SPACE_CHARACTERS + ['>', '<']).include?(@data.current_byte)
      return [attr_name.join(''), attr_value.join('')]
    elsif ASCII_UPPERCASE.include?(@data.current_byte)
      attr_value.push(@data.current_byte.downcase)
    else
      attr_value.push(@data.current_byte)
    end
  end
end
get_encoding() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 426
# Walk the data and, at each position, run the handler registered in
# @@method_dispatch for the first key that matches there. Scanning stops
# once a handler returns a false value or the data is exhausted.
#
# Returns the recorded encoding with surrounding whitespace stripped, or
# nil when no encoding was recorded.
def get_encoding
  @data.each do |_byte|
    matched = @@method_dispatch.find { |pattern, _handler| @data.match_bytes(pattern, true) }
    next if matched.nil?

    break unless send(matched[1])
  end
  @encoding = @encoding.strip unless @encoding.nil?
  @encoding
end
handle_comment() click to toggle source

Skip over comments

# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 442
# Skip over a comment by jumping to its closing '-->' marker.
# Returns whatever @data.jump_to returns.
def handle_comment
  @data.jump_to('-->')
end
handle_meta() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 446
# Inspect the attributes of a meta element for an encoding declaration.
#
# Returns false once a valid encoding has been stored in @encoding, and
# true when nothing usable was found in this element.
def handle_meta
  # "<meta" not followed by a space character: just keep going
  return true unless SPACE_CHARACTERS.include?(@data.current_byte)

  # A valid meta element: examine its attributes one after another
  while true
    pair = get_attribute
    return true if pair.nil?

    name, value = pair
    case name
    when 'charset'
      candidate = value
    when 'content'
      candidate = ContentAttrParser.new(EncodingBytes.new(value)).parse
    else
      next
    end

    next unless HTML5.is_valid_encoding(candidate)

    @encoding = candidate
    return false
  end
end
handle_other() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 508
# Skip ahead to the next '>' byte.
# Returns whatever @data.jump_to returns.
def handle_other
  @data.jump_to('>')
end
handle_possible_end_tag() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 478
# Advance one byte, then process the rest as a possible end tag.
# Returns the result of handle_possible_tag(true).
def handle_possible_end_tag
  @data.position += 1
  handle_possible_tag(true)
end
handle_possible_start_tag() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 474
# Process the bytes at the current position as a possible start tag.
# Returns the result of handle_possible_tag(false).
def handle_possible_start_tag
  handle_possible_tag(false)
end
handle_possible_tag(end_tag) click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 483
# Process a possible start or end tag at the current position.
#
# end_tag - true when called for an end tag, false for a start tag
#
# Always returns true.
def handle_possible_tag(end_tag)
  if ASCII_LETTERS.include?(@data.current_byte)
    @data.find_next(SPACE_CHARACTERS + ['<', '>'])

    if @data.current_byte == '<'
      # Go back to the first step of the overall "two step" algorithm
      # so that the '<' byte is reprocessed
      @data.position -= 1
    else
      # Consume every attribute; the values themselves are discarded
      until get_attribute.nil?
      end
    end
  elsif end_tag
    # Not an ASCII letter after the end-tag opener: step back and treat
    # the fragment according to handle_other. For a possible start tag the
    # fragment is simply ignored.
    @data.position -= 1
    handle_other
  end
  true
end

[Validate]

Generated with the Darkfish Rdoc Generator 2.