This module provides sanitization of XHTML+MathML+SVG and of inline style attributes.
It can be applied either at the Tokenizer stage:
HTMLParser.parse(html, :tokenizer => HTMLSanitizer)
or, if you already have a parse tree (in this example, a REXML tree), at the Serializer stage:
tokens = TreeWalkers.get_tree_walker('rexml').new(tree)
HTMLSerializer.serialize(tokens, {:encoding=>'utf-8', :sanitize => true})
Subclasses may define their own versions of these constants.
# File lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb, line 151
#
# Sanitizes the value of an inline +style+ attribute.
#
# The value is coerced with +to_s+, every <tt>url(...)</tt> reference is
# replaced by a space, and the result must then pass two whole-string
# checks (the "gauntlet"); if either fails the method returns ''.
# Surviving <tt>property: value</tt> declarations are kept only when:
#
# * the property is listed in ALLOWED_CSS_PROPERTIES, or
# * the property is a background/border/margin/padding shorthand (or a
#   hyphenated variant) and every whitespace-separated word of the value
#   is in ALLOWED_CSS_KEYWORDS or looks like a colour / short length, or
# * the property is listed in ALLOWED_SVG_PROPERTIES.
#
# The ALLOWED_* lists are looked up through <tt>self.class.const_get</tt>
# at the point of use.
#
# @param style [#to_s] raw style attribute value
# @return [String] accepted declarations, each rendered as "prop: value;"
#   and joined by single spaces; '' when nothing is accepted
def sanitize_css(style)
  # disallow urls
  style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

  # gauntlet
  # Both checks are anchored with \A / \z rather than ^ / $: in Ruby ^ and $
  # match at every line boundary, so a value with a leading or embedded
  # newline could satisfy the check on one (possibly empty) line while the
  # remaining lines reached the scan below unvetted.
  return '' unless style =~ /\A([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/
  # Trailing whitespace (e.g. a final newline) after the last declaration
  # is still tolerated.
  return '' unless style =~ /\A(\s*[-\w]+\s*:\s*[^:;]*(;|$))*\s*\z/

  clean = []
  style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
    next if val.empty?

    prop = prop.downcase
    if self.class.const_get("ALLOWED_CSS_PROPERTIES").include?(prop)
      clean << "#{prop}: #{val};"
    elsif %w[background border margin padding].include?(prop.split('-')[0])
      # Shorthand properties: reject the whole declaration if any word of
      # the value is neither an allowed keyword nor a colour/length token.
      rejected = val.split.any? do |keyword|
        !self.class.const_get("ALLOWED_CSS_KEYWORDS").include?(keyword) &&
          keyword !~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
      end
      clean << "#{prop}: #{val};" unless rejected
    elsif self.class.const_get("ALLOWED_SVG_PROPERTIES").include?(prop)
      clean << "#{prop}: #{val};"
    end
  end

  clean.join(' ')
end
# File lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb, line 110
#
# Sanitizes a single token hash in place and returns it.
#
# * Start/end/empty tags whose name is in ALLOWED_ELEMENTS keep their type.
#   If the token has a :data key, attributes not in ALLOWED_ATTRIBUTES are
#   dropped, URI-valued attributes (ATTR_VAL_IS_URI) whose scheme is not in
#   ALLOWED_PROTOCOLS are dropped, and a +style+ attribute is passed through
#   sanitize_css. :data is written back as an array of [name, value] pairs.
# * Any other tag is turned into a :Characters token whose :data is the
#   tag's markup as text (attribute values HTML-escaped); :name is removed.
# * Comment tokens have their :data replaced by an empty string.
# * All other tokens are returned untouched.
#
# ALLOWED_* lists are looked up through <tt>self.class.const_get</tt>.
#
# @param token [Hash] token with :type and, for tags, :name / :data
# @return [Hash] the same token object, modified as described above
def sanitize_token(token)
  case token[:type]
  when :StartTag, :EndTag, :EmptyTag
    if self.class.const_get("ALLOWED_ELEMENTS").include?(token[:name])
      if token.key?(:data)
        attrs = Hash[*token[:data].flatten]
        attrs.delete_if { |attr, _value| !self.class.const_get("ALLOWED_ATTRIBUTES").include?(attr) }

        ATTR_VAL_IS_URI.each do |attr|
          # Normalise the value the way a lenient URL parser might before
          # looking at its scheme: decode entities, then strip backticks,
          # control characters / whitespace (U+0000-U+0020, U+007F) and
          # U+0080-U+00A0 (C1 controls and NBSP). Only this normalised
          # copy is inspected; the attribute value itself is not rewritten.
          val_unescaped = CGI.unescapeHTML(attrs[attr].to_s)
                             .gsub(/[`\u0000-\u0020\u007F-\u00A0]+/, '')
                             .downcase
          if val_unescaped =~ /\A[a-z0-9][-+.a-z0-9]*:/ &&
             !self.class.const_get("ALLOWED_PROTOCOLS").include?(val_unescaped.split(':')[0])
            attrs.delete attr
          end
        end

        attrs['style'] = sanitize_css(attrs['style']) if attrs['style']
        token[:data] = attrs.map { |k, v| [k, v] }
      end
      token
    else
      # Disallowed element: emit the tag as literal text instead of markup.
      if token[:type] == :EndTag
        token[:data] = "</#{token[:name]}>"
      elsif token[:data]
        # to_s guards against a nil attribute value, which escapeHTML rejects.
        attrs = token[:data].map { |k, v| " #{k}=\"#{CGI.escapeHTML(v.to_s)}\"" }.join('')
        token[:data] = "<#{token[:name]}#{attrs}>"
      else
        token[:data] = "<#{token[:name]}>"
      end
      # Insert the self-closing slash just before the final '>'.
      token[:data].insert(-2, '/') if token[:type] == :EmptyTag
      token[:type] = :Characters
      token.delete(:name)
      token
    end
  when :Comment
    token[:data] = ""
    token
  else
    token
  end
end
Generated with the Darkfish Rdoc Generator 2.