Class/Module Index [+]

Quicksearch

HTML5::HTMLSanitizeModule

This module provides sanitization of XHTML+MathML+SVG and of inline style attributes.

It can be applied either at the Tokenizer stage:

HTMLParser.parse(html, :tokenizer => HTMLSanitizer)

or, if you already have a parse tree (in this example, a REXML tree), at the Serializer stage:

tokens = TreeWalkers.get_tree_walker('rexml').new(tree)
HTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
   :sanitize => true})

Public Instance Methods

sanitize_css(style) click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb, line 151
# Sanitizes the contents of an inline +style+ attribute.
#
# +style+ is coerced with +to_s+ (so +nil+ is treated as an empty string).
# Any <tt>url(...)</tt> reference is removed, then the whole string must pass
# two whole-string checks (the "gauntlet"); if either fails, an empty string
# is returned. Surviving <tt>property: value</tt> declarations are kept when:
#
# * the property is listed in ALLOWED_CSS_PROPERTIES, or
# * the property is a background/border/margin/padding (sub)property and
#   every whitespace-separated word of the value is either listed in
#   ALLOWED_CSS_KEYWORDS or looks like a colour / short numeric length, or
# * the property is listed in ALLOWED_SVG_PROPERTIES.
#
# The ALLOWED_* lists are looked up on <tt>self.class</tt>.
#
# Returns the kept declarations joined as <tt>"prop: val; prop: val;"</tt>.
def sanitize_css(style)
  # disallow urls
  style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

  # gauntlet -- anchored with \A/\z rather than ^/$: the line anchors would
  # let any input containing an empty (or otherwise harmless) line pass,
  # because only one line would have to satisfy the pattern.
  return '' unless style =~ /\A([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/
  return '' unless style =~ /\A(\s*[-\w]+\s*:\s*[^:;]*(;|\z))*\z/

  clean = []
  style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
    next if val.empty?
    prop = prop.downcase
    if self.class.const_get("ALLOWED_CSS_PROPERTIES").include?(prop)
      clean << "#{prop}: #{val};"
    elsif %w[background border margin padding].include?(prop.split('-')[0])
      # Shorthand-style properties: reject the declaration if any word of the
      # value is neither a known keyword nor a colour / small length token.
      suspicious = val.split.any? do |keyword|
        !self.class.const_get("ALLOWED_CSS_KEYWORDS").include?(keyword) &&
          keyword !~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
      end
      clean << "#{prop}: #{val};" unless suspicious
    elsif self.class.const_get("ALLOWED_SVG_PROPERTIES").include?(prop)
      clean << "#{prop}: #{val};"
    end
  end

  clean.join(' ')
end
sanitize_token(token) click to toggle source
# File lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb, line 110
# Sanitizes a single token Hash and returns it; the token is modified in
# place.
#
# * <tt>:StartTag</tt>, <tt>:EndTag</tt> and <tt>:EmptyTag</tt> tokens whose
#   <tt>:name</tt> is in ALLOWED_ELEMENTS keep their type. If they carry
#   <tt>:data</tt> (attribute name/value pairs), attributes not in
#   ALLOWED_ATTRIBUTES are dropped, attributes named in ATTR_VAL_IS_URI are
#   dropped when their value has a scheme that is not in ALLOWED_PROTOCOLS,
#   and a +style+ attribute is passed through +sanitize_css+.
# * Tag tokens for any other element are turned into a <tt>:Characters</tt>
#   token whose <tt>:data</tt> is the tag's source text (attribute values
#   HTML-escaped); the <tt>:name</tt> key is removed.
# * <tt>:Comment</tt> tokens have their <tt>:data</tt> emptied.
# * Every other token is returned untouched.
#
# ALLOWED_ELEMENTS, ALLOWED_ATTRIBUTES and ALLOWED_PROTOCOLS are looked up on
# <tt>self.class</tt>.
def sanitize_token(token)
  case token[:type]
  when :StartTag, :EndTag, :EmptyTag
    if self.class.const_get("ALLOWED_ELEMENTS").include?(token[:name])
      if token.has_key? :data
        attrs = Hash[*token[:data].flatten]
        attrs.delete_if { |attr, _v| !self.class.const_get("ALLOWED_ATTRIBUTES").include?(attr) }
        ATTR_VAL_IS_URI.each do |attr|
          # Normalise the value before inspecting its scheme: decode HTML
          # entities, then strip backticks, control characters and whitespace
          # (U+0000-U+0020, U+007F) and U+0080-U+00A0 (which includes the
          # no-break space), so a scheme split by such characters is still
          # recognised.
          # NOTE(review): the \u ranges assume the value is UTF-8 compatible
          # -- confirm against the tokenizer's output encoding.
          val_unescaped = CGI.unescapeHTML(attrs[attr].to_s).gsub(/`|[\u0000-\u0020\u007f\s]+|[\u0080-\u00a0]/, '').downcase
          if val_unescaped =~ /\A[a-z0-9][-+.a-z0-9]*:/ && !self.class.const_get("ALLOWED_PROTOCOLS").include?(val_unescaped.split(':')[0])
            attrs.delete attr
          end
        end
        if attrs['style']
          attrs['style'] = sanitize_css(attrs['style'])
        end
        token[:data] = attrs.to_a
      end
      return token
    else
      # Disallowed element: re-serialise the tag as plain character data.
      if token[:type] == :EndTag
        token[:data] = "</#{token[:name]}>"
      elsif token[:data]
        attrs = token[:data].map { |k, v| " #{k}=\"#{CGI.escapeHTML(v)}\"" }.join('')
        token[:data] = "<#{token[:name]}#{attrs}>"
      else
        token[:data] = "<#{token[:name]}>"
      end
      # Empty tags are written self-closing: put the slash before the final '>'.
      token[:data].insert(-2, '/') if token[:type] == :EmptyTag
      token[:type] = :Characters
      token.delete(:name)
      return token
    end
  when :Comment
    token[:data] = ""
    return token
  else
    return token
  end
end

[Validate]

Generated with the Darkfish Rdoc Generator 2.