Parent

Class/Module Index [+]

Quicksearch

HTML5::HTMLInputStream

This class takes care of character encoding and removing or replacing incorrect byte-sequences and also provides column and line tracking.

Attributes

char_encoding[RW]
errors[RW]
queue[RW]

Public Class Methods

new(source, options = {}) click to toggle source

Initialises the HTMLInputStream.

HTMLInputStream(source, [encoding]) -> Normalized stream from source for use by the HTML5Lib.

source can be either a file-object, local filename or a string.

The optional encoding option must be a string naming the encoding. If it is specified and recognised as a valid encoding, it is used regardless of any BOM or later declaration (such as in a meta element); otherwise the encoding is detected from the stream.

parse_meta - Look for a <meta> element containing encoding information (enabled by default)

# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 29
# Sets up the stream: wraps +source+, settles on a character encoding and
# primes the read buffer.
#
# source::  an object responding to +read+, or a String (see open_stream)
# options:: every key/value pair is stored as the instance variable
#           <tt>@key</tt>; +encoding+, +parse_meta+ and +chardet+ are the
#           ones given defaults here
def initialize(source, options = {})
  # Defaults, each of which may be replaced through +options+
  @encoding   = nil
  @parse_meta = true
  @chardet    = true

  options.each do |key, setting|
    instance_variable_set("@#{key}", setting)
  end

  # Underlying byte source
  @raw_stream = open_stream(source)

  # Bytes inspected when hunting for a <meta> encoding declaration
  @NUM_BYTES_META    = 512
  # Bytes handed to chardet per read while guessing the encoding
  @NUM_BYTES_CHARDET = 256
  # Bytes pulled from the stream per content read
  @NUM_BYTES_BUFFER  = 1024
  # Fallback when nothing else identifies the encoding
  @DEFAULT_ENCODING  = 'windows-1252'

  # An explicit, valid "transport level" encoding wins; anything else
  # triggers detection.
  explicit = !@encoding.nil? && HTML5.is_valid_encoding(@encoding)
  @char_encoding = explicit ? @encoding : detect_encoding

  # Prime the buffer. windows-1252 and utf-8 are decoded byte by byte in
  # #char; every other encoding is read in full and converted to utf-8 with
  # iconv, falling back to windows-1252 handling if the conversion fails.
  @buffer = @raw_stream.read(@NUM_BYTES_BUFFER) || ''
  if @char_encoding == 'windows-1252'
    @win1252 = true
  else
    unless @char_encoding == 'utf-8'
      require 'iconv'
      begin
        @buffer << @raw_stream.read unless @raw_stream.eof?
        @buffer = Iconv.iconv('utf-8', @char_encoding, @buffer).first
      rescue
        @win1252 = true
      end
    end
  end

  @queue  = []
  @errors = []

  # Read offset into @buffer and line/column bookkeeping
  @tell = 0
  @line = 0
  @col  = 0
  @line_lengths = []
end

Public Instance Methods

char() click to toggle source

Read one character from the stream or queue if available. Return EOF when EOF is reached.

# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 252
# Reads one character, taking it from the pushback queue when that is
# non-empty and otherwise decoding it from the byte buffer (which is topped
# up from the raw stream as needed).
#
# Returns a String holding the character as UTF-8 bytes, or :EOF once the
# input is exhausted. Carriage returns (optionally followed by a line feed)
# are returned as "\n"; NUL bytes and malformed UTF-8 are returned as
# U+FFFD, and a NUL additionally records "null-character" in @errors.
#
# NOTE(review): the byte branches compare <tt>@buffer[@tell]</tt> against
# Integer ranges, i.e. they expect String#[] to yield an Integer byte.
def char
  # Pushed-back characters take priority over the stream.
  return @queue.shift unless @queue.empty?

  # Top up while fewer than four unread bytes remain, so that even the
  # longest (4-byte) UTF-8 sequence is never cut off at the buffer's end.
  if @tell + 4 > @buffer.length && !@raw_stream.eof?
    # read next block
    @buffer = @buffer[@tell..-1] + @raw_stream.read(@NUM_BYTES_BUFFER)
    @tell = 0
  end

  c = @buffer[@tell]
  @tell += 1

  case c
  when 0x01..0x7F
    if c == 0x0D
      # normalize newlines: CR and CR LF both become LF
      @tell += 1 if @buffer[@tell] == 0x0A
      c = 0x0A
    end

    # update position in stream
    if c == 0x0a
      @line_lengths << @col
      @line += 1
      @col = 0
    else
      @col += 1
    end

    c.chr

  when 0x80..0xBF
    if !@win1252
      [0xFFFD].pack('U') # a bare continuation byte is invalid utf-8
    elsif c <= 0x9f
      [ENTITIES_WINDOWS1252[c-0x80]].pack('U')
    else
      "\xC2" + c.chr # convert to utf-8
    end

  when 0xC0..0xFF
    # Read the flag directly, as the 0x80..0xBF branch does; an unset
    # instance variable simply reads as nil.
    if @win1252
      "\xC3" + (c - 64).chr # convert to utf-8
    # from http://www.w3.org/International/questions/qa-forms-utf-8.en.php
    # The x flag is required for the layout and annotations below to be
    # ignored rather than matched literally; n keeps the match byte-wise.
    elsif @buffer[@tell - 1..@tell + 3] =~ /^
          ( [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
          |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
          | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
          |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
          |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
          | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
          |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
          )/nx
      # The lead byte was consumed above; skip the continuation bytes too.
      @tell += $1.length - 1
      $1
    else
      [0xFFFD].pack('U') # invalid utf-8
    end

  when 0x00
    @errors.push("null-character")
    [0xFFFD].pack('U') # null characters are invalid

  else
    # Reading past the end of the buffer yields nil, which matches none of
    # the byte ranges above.
    :EOF
  end
end
chars_until(characters, opposite=false) click to toggle source

Returns a string of characters from the stream up to but not including any character in characters or EOF. characters can be any container that supports the include? method. If opposite is true the test is inverted: characters are consumed for as long as they are in characters.

# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 325
# Collects characters from #char until one is found whose membership in
# +characters+ differs from +opposite+, or until :EOF.
#
# characters:: any object answering +include?+
# opposite::   false (default) stops at the first character that IS in
#              +characters+; true stops at the first one that is NOT
#
# The character that ended the run is pushed back onto the front of the
# queue (:EOF is not). Returns the collected characters joined into one
# String.
def chars_until(characters, opposite=false)
  collected = []
  current = char

  until current == :EOF
    break if characters.include?(current) != opposite
    collected << current
    current = char
  end

  # Hand the terminating character back so the next read sees it again.
  @queue.unshift(current) unless current == :EOF
  collected.join('')
end
detect_bom() click to toggle source

Attempts to detect a BOM at the start of the stream. If an encoding can be determined from the BOM, returns the name of that encoding; otherwise returns nil.

# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 147
# Looks for a byte order mark in the first four bytes of the raw stream.
#
# Returns the encoding name implied by the BOM ('utf-8', 'utf-16le',
# 'utf-16be', 'utf-32le' or 'utf-32be'), or nil when there is no BOM or the
# stream yields nothing. Unless the stream yielded nothing, the read
# position is then reset through #seek: just past the BOM when one was
# found, otherwise back to the start.
def detect_bom
  # Signatures as raw byte values so the comparison does not depend on the
  # encoding tags of the strings involved. Order matters: the UTF-32LE mark
  # (FF FE 00 00) begins with the UTF-16LE mark (FF FE) and must be tried
  # first.
  signatures = [
    [[0xEF, 0xBB, 0xBF],       'utf-8'],
    [[0xFF, 0xFE, 0x00, 0x00], 'utf-32le'],
    [[0x00, 0x00, 0xFE, 0xFF], 'utf-32be'],
    [[0xFF, 0xFE],             'utf-16le'],
    [[0xFE, 0xFF],             'utf-16be']
  ]

  # Read the first four bytes; nil means the stream had nothing to give.
  head = @raw_stream.read(4)
  return nil unless head

  bytes = head.unpack('C*')
  mark, encoding = signatures.find { |bom, _name| bytes[0, bom.length] == bom }

  # Skip exactly as many bytes as the matched BOM occupies (0 without one).
  seek(head, mark ? mark.length : 0)

  return encoding
end
detect_encoding() click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 94
# Works out the stream's encoding, trying in order: a byte order mark, a
# <meta> declaration (when @parse_meta is set), the optional chardet gem
# (when @chardet is set) and finally @DEFAULT_ENCODING.
#
# Returns the encoding name; 'iso-8859-1' (in any letter case) is reported
# as 'windows-1252'.
def detect_encoding
  # A BOM comes first; detect_bom also moves the stream past it.
  encoding = detect_bom

  # Without a BOM, look for a meta element carrying encoding information.
  encoding = detect_encoding_meta if encoding.nil? && @parse_meta

  # Still nothing: let chardet guess, if it is available.
  if encoding.nil? && @chardet
    begin
      require 'rubygems'
      require 'UniversalDetector' # gem install chardet
      chunks = []
      detector = UniversalDetector::Detector.instance
      detector.reset
      until @raw_stream.eof?
        chunk = @raw_stream.read(@NUM_BYTES_CHARDET)
        break if !chunk or chunk.empty?
        chunks << chunk
        detector.feed(chunk)
        break if detector.instance_eval { @done }
        # Coerce the detector's last-character state to a String when it
        # holds a Fixnum.
        detector.instance_eval do
          @_mLastChar = @_mLastChar.chr if Fixnum === @_mLastChar
        end
      end
      detector.close
      encoding = detector.result['encoding']
      # Rewind so the bytes fed to the detector are read again as content.
      seek(chunks.join(''), 0)
    rescue LoadError
      # chardet is not installed; carry on without a guess.
    end
  end

  # Last resort.
  encoding = @DEFAULT_ENCODING if encoding.nil?

  # Substitute the equivalent encoding.
  encoding = 'windows-1252' if encoding.downcase == 'iso-8859-1'

  encoding
end
detect_encoding_meta() click to toggle source

Report the encoding declared by the meta element

# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 228
# Reads the first @NUM_BYTES_META bytes of the raw stream, hands them to an
# EncodingParser, rewinds the stream to its start via #seek and returns
# whatever encoding the parser reports.
def detect_encoding_meta
  head = @raw_stream.read(@NUM_BYTES_META)
  sniffer = EncodingParser.new(head)
  # Put the inspected bytes back so they are read again as content.
  seek(head, 0)
  sniffer.get_encoding
end
open_stream(source) click to toggle source

Produces a file object from source.

source can be either a file object, local filename or a string.

# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 84
# Returns an object to read bytes from.
#
# source:: returned untouched when it already responds to +read+; anything
#          else is treated as string content and wrapped in a StringIO
#          (a filename is NOT opened here).
def open_stream(source)
  source.respond_to?(:read) ? source : StringIO.new(source)
end
position() click to toggle source

Returns (line, col) of the current position in the stream.

# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 236
# Returns [line, col] for the current position, with the line number
# counted from 1.
#
# @line/@col describe how far #char has read; characters still waiting in
# the pushback queue have not been consumed yet, so they are walked
# backwards and subtracted again. Raises RuntimeError if a queued newline
# is reached while the column is not 0.
def position
  row = @line
  column = @col

  @queue.reverse_each do |queued|
    unless queued == "\n"
      column -= 1
      next
    end

    # Stepping back over a newline lands at the end of the previous line.
    row -= 1
    raise RuntimeError.new("col=#{column}") unless column == 0
    column = @line_lengths[row]
  end

  [row + 1, column]
end
seek(buffer, n) click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 180
# Repositions the raw stream so that the next read starts at offset +n+ of
# +buffer+, where +buffer+ holds the bytes read from the stream so far.
#
# Three strategies are tried in order:
# 1. if the stream already supports +unget+ (it was wrapped by an earlier
#    call), push <tt>buffer[n..-1]</tt> back onto it;
# 2. if it supports +seek+, seek to absolute offset +n+;
# 3. otherwise (or when seeking raises Errno::ESPIPE) wrap the stream in a
#    SimpleDelegator that keeps pushed-back bytes in memory and serves
#    them before reading from the real stream again.
def seek(buffer, n)
  if @raw_stream.respond_to?(:unget)
    @raw_stream.unget(buffer[n..-1])
    return
  end

  if @raw_stream.respond_to?(:seek)
    begin
      @raw_stream.seek(n)
      return
    rescue Errno::ESPIPE
      # Not seekable (e.g. a pipe): fall through to the wrapper below.
    end
  end

  require 'delegate'
  @raw_stream = SimpleDelegator.new(@raw_stream)

  class << @raw_stream
    # Reads +chars+ bytes (everything when -1), draining pushed-back data
    # before touching the wrapped stream.
    def read(chars=-1)
      if chars == -1 or chars > @data.length
        # Pending data alone cannot satisfy the request: hand it all over
        # and top up from the wrapped stream.
        result = @data
        @data = ''
        return result if __getobj__.eof?
        return result + __getobj__.read if chars == -1
        return result + __getobj__.read(chars-result.length)
      elsif @data.empty?
        return __getobj__.read(chars)
      else
        # Serve the request from the FRONT of the pending data.
        result = @data[0...chars]
        @data = @data[chars..-1]
        return result
      end
    end

    # Pushes +data+ back so that it is read again before anything that is
    # still pending (it was read from the stream earlier than that).
    def unget(data)
      @data = (data || '') + (@data || '')
    end

    # The stream is only exhausted once the pushed-back data is gone too.
    def eof?
      (@data.nil? || @data.empty?) && __getobj__.eof?
    end
  end

  @raw_stream.unget(buffer[n .. -1])
end
unget(characters) click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 340
# Pushes +characters+ back onto the front of the queue so that #char
# returns them before reading any further from the stream.
#
# characters:: a String, an Array of Strings, or :EOF (which is ignored)
#
# Returns the queue, or nil when given :EOF.
def unget(characters)
  return if characters == :EOF

  # Kernel#Array wraps a lone String and passes an Array through, without
  # relying on String#to_a.
  @queue.unshift(*Array(characters))
end

[Validate]

Generated with the Darkfish Rdoc Generator 2.