class Sanitize
Constants
- REGEX_PROTOCOL
Matches an attribute value that could be treated by a browser as a URL with a protocol prefix, such as “http:” or “javascript:”. Any string of zero or more characters followed by a colon is considered a match, even if the colon is encoded as an entity and even if it's an incomplete entity (which IE6 and Opera will still parse).
- REGEX_UNSUITABLE_CHARS
Matches Unicode characters that should be stripped from HTML before passing it to the parser.
- VERSION
Attributes
Public Class Methods
Returns a sanitized copy of the given full html document, using the settings in config if specified.
When sanitizing a document, the `<html>` element must be whitelisted or an error will be raised. If this is undesirable, you should probably use `fragment` instead.
# File lib/sanitize.rb, line 44
#
# Convenience entry point: builds a Sanitize instance from +config+ and
# delegates to its #document method.
#
# html   - the full HTML document to sanitize.
# config - optional settings hash handed to Sanitize.new (defaults to {}).
#
# Returns whatever the instance-level #document returns for +html+.
def self.document(html, config = {})
  sanitizer = Sanitize.new(config)
  sanitizer.document(html)
end
Returns a sanitized copy of the given html fragment, using the settings in config if specified.
# File lib/sanitize.rb, line 50
#
# Convenience entry point: builds a Sanitize instance from +config+ and
# delegates to its #fragment method.
#
# html   - the HTML fragment to sanitize.
# config - optional settings hash handed to Sanitize.new (defaults to {}).
#
# Returns whatever the instance-level #fragment returns for +html+.
def self.fragment(html, config = {})
  sanitizer = Sanitize.new(config)
  sanitizer.fragment(html)
end
Returns a new Sanitize object initialized with the settings in config.
# File lib/sanitize.rb, line 76
#
# Returns a new Sanitize object initialized with the settings in +config+.
#
# The given settings are merged over Config::DEFAULT, and the transformer
# chain is assembled in this order:
#
# 1. any custom transformers from the merged config's :transformers entry;
# 2. Transformers::CleanComment, unless :allow_comments is set;
# 3. Transformers::CleanDoctype, unless :allow_doctype is set;
# 4. Transformers::CSS::CleanElement, when 'style' is among :elements;
# 5. Transformers::CSS::CleanAttribute, when any :attributes list includes
#    'style';
# 6. Transformers::CleanCDATA, then Transformers::CleanElement.
#
# config - optional settings hash (defaults to {}).
def initialize(config = {})
  @config = Config.merge(Config::DEFAULT, config)

  # Coerce to an Array first and copy the *list* afterwards. Duplicating the
  # raw setting before coercion would copy a lone (non-Array) transformer
  # object itself -- and Object#dup drops singleton methods such as a
  # per-object `call` -- and would call `dup` on nil when nothing is
  # configured. Copying the list still keeps the appends below from mutating
  # the array stored in the config.
  @transformers = Array(@config[:transformers]).dup

  # Default transformers always run at the end of the chain, after any custom
  # transformers.
  @transformers << Transformers::CleanComment unless @config[:allow_comments]
  @transformers << Transformers::CleanDoctype unless @config[:allow_doctype]

  if @config[:elements].include?('style')
    scss = Sanitize::CSS.new(config)
    @transformers << Transformers::CSS::CleanElement.new(scss)
  end

  if @config[:attributes].values.any? { |attr| attr.include?('style') }
    # Reuse the CSS sanitizer created above when there is one, so both CSS
    # transformers share a single instance.
    scss ||= Sanitize::CSS.new(config)
    @transformers << Transformers::CSS::CleanAttribute.new(scss)
  end

  @transformers <<
    Transformers::CleanCDATA <<
    Transformers::CleanElement.new(@config)
end
Sanitizes the given `Nokogiri::XML::Node` instance and all its children.
# File lib/sanitize.rb, line 55
#
# Convenience entry point: builds a Sanitize instance from +config+ and
# delegates to its #node! method.
#
# node   - the node to sanitize.
# config - optional settings hash handed to Sanitize.new (defaults to {}).
#
# Returns whatever the instance-level #node! returns for +node+.
def self.node!(node, config = {})
  sanitizer = Sanitize.new(config)
  sanitizer.node!(node)
end
Public Instance Methods
Returns a sanitized copy of the given html document.
When sanitizing a document, the `<html>` element must be whitelisted or an error will be raised. If this is undesirable, you should probably use `#fragment` instead.
# File lib/sanitize.rb, line 106
#
# Sanitizes a complete HTML document.
#
# The input is preprocessed, parsed with Nokogiri::HTML5, sanitized in place
# via #node!, and serialized again via #to_html.
#
# html - the document source; a nil/false value short-circuits to ''.
#
# Returns the serialized, sanitized document.
def document(html)
  return '' unless html

  parsed = Nokogiri::HTML5.parse(preprocess(html)).tap { |doc| node!(doc) }
  to_html(parsed)
end
Returns a sanitized copy of the given html fragment.
# File lib/sanitize.rb, line 118
#
# Sanitizes an HTML fragment.
#
# The preprocessed input is wrapped in "<html><body>" and parsed as a full
# document; the relevant nodes are then moved into a document fragment, which
# is sanitized in place via #node! and serialized via #to_html.
#
# html - the fragment source; a nil/false value short-circuits to ''.
#
# Returns the serialized, sanitized fragment.
def fragment(html)
  return '' unless html

  cleaned = preprocess(html)
  parsed = Nokogiri::HTML5.parse("<html><body>#{cleaned}")

  # Hack to allow fragments containing <body>. Borrowed from
  # Nokogiri::HTML::DocumentFragment.
  body_path = (cleaned =~ /\A<body(?:\s|>)/i) ? '/html/body' : '/html/body/node()'

  result = parsed.fragment
  result << parsed.xpath(body_path)

  node!(result)
  to_html(result)
end
Sanitizes the given `Nokogiri::XML::Node` and all its children, modifying it in place.
If node is a `Nokogiri::XML::Document`, the `<html>` element must be whitelisted or an error will be raised.
# File lib/sanitize.rb, line 147
#
# Sanitizes +node+ and everything beneath it, modifying the tree in place.
#
# Every node visited by #traverse is run through #transform_node!, sharing a
# single whitelist Set across the whole walk.
#
# node - a Nokogiri::XML::Node.
#
# Raises ArgumentError if +node+ is not a Nokogiri::XML::Node.
# Raises Error if +node+ is a Nokogiri::XML::Document and 'html' is not among
# the configured :elements.
#
# Returns the same +node+ that was passed in.
def node!(node)
  raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)

  if node.is_a?(Nokogiri::XML::Document) && !@config[:elements].include?('html')
    raise Error, 'When sanitizing a document, "<html>" must be whitelisted.'
  end

  whitelisted = Set.new
  traverse(node) { |current| transform_node!(current, whitelisted) }

  node
end
Private Instance Methods
Preprocesses HTML before parsing to remove undesirable Unicode chars.
# File lib/sanitize.rb, line 171
#
# Prepares raw input for the parser.
#
# Works on a copy of html.to_s, so the caller's object is not modified.
# The copy is transcoded to UTF-8 when its encoding name is anything else
# (invalid and undefined sequences are replaced), and every match of
# REGEX_UNSUITABLE_CHARS is then removed.
#
# html - the input; converted with #to_s.
#
# Returns the cleaned String copy.
def preprocess(html)
  text = html.to_s.dup

  if text.encoding.name != 'UTF-8'
    text.encode!('UTF-8', :invalid => :replace, :undef => :replace)
  end

  text.gsub!(REGEX_UNSUITABLE_CHARS, '')
  text
end
# File lib/sanitize.rb, line 184
#
# Serializes +node+ to an HTML string without extra formatting.
#
# For document nodes, an automatically added Content-Type <meta> tag is
# stripped from the output when 'meta' is not among the configured :elements,
# or when the document has no http-equiv="content-type" meta tag of its own.
#
# node - the node to serialize.
#
# Returns the serialized HTML String.
def to_html(node)
  strip_meta = false
  meta_pattern = nil
  node_type = node.type

  # Hacky workaround for a libxml2 bug that adds an undesired Content-Type
  # meta tag to all serialized HTML documents.
  #
  # https://github.com/sparklemotion/nokogiri/issues/1008
  if node_type == Nokogiri::XML::Node::DOCUMENT_NODE ||
      node_type == Nokogiri::XML::Node::HTML_DOCUMENT_NODE

    meta_pattern = %r|(<html[^>]*>\s*<head[^>]*>\s*)<meta http-equiv="Content-Type" content="text/html; charset=utf-8">|i

    # Only strip the content-type meta tag if <meta> isn't whitelisted or the
    # original document didn't actually include a content-type meta tag.
    strip_meta =
      if @config[:elements].include?('meta')
        node.xpath('/html/head/meta[@http-equiv]').none? do |meta|
          meta['http-equiv'].downcase == 'content-type'
        end
      else
        true
      end
  end

  save_options = Nokogiri::XML::Node::SaveOptions
  flags = save_options::NO_DECLARATION | save_options::NO_EMPTY_TAGS | save_options::AS_HTML

  # Serialize to HTML without any formatting to prevent Nokogiri from adding
  # newlines after certain tags.
  output = node.to_html(:encoding => 'utf-8', :indent => 0, :save_with => flags)

  output.gsub!(meta_pattern, '\1') if strip_meta
  output
end
# File lib/sanitize.rb, line 218
#
# Runs every transformer in the chain against +node+, in order.
#
# Each transformer is called with the config, the node, its lowercased name,
# the shared whitelist, and a flag telling whether the node is currently in
# that whitelist. When a transformer returns a Hash whose :node_whitelist
# value responds to #each, those entries are merged into the shared
# whitelist, so later transformers see them.
#
# node           - the node being transformed.
# node_whitelist - collection of already-whitelisted nodes; must support
#                  #include? and #merge.
#
# Returns +node+.
def transform_node!(node, node_whitelist)
  @transformers.each do |transformer|
    # The environment is rebuilt for every transformer so that changes made
    # by an earlier one (whitelist additions, node name) are picked up.
    outcome = transformer.call(
      config: @config,
      is_whitelisted: node_whitelist.include?(node),
      node: node,
      node_name: node.name.downcase,
      node_whitelist: node_whitelist
    )

    next unless outcome.is_a?(Hash)

    additions = outcome[:node_whitelist]
    node_whitelist.merge(additions) if additions.respond_to?(:each)
  end

  node
end
Performs top-down traversal of the given node, operating first on the node itself, then traversing each child (if any) in order.
# File lib/sanitize.rb, line 238
#
# Walks the tree rooted at +node+ top-down: the block is yielded +node+
# first, then each child subtree is walked in order.
#
# The walk tolerates the block unlinking or reparenting the child currently
# being visited: the child's previous sibling is captured before descending
# and is used afterwards to find where to resume.
#
# node  - the root of the walk; must respond to #child, and its descendants
#         to #previous_sibling, #next_sibling and #parent.
# block - called once per visited node.
def traverse(node, &block)
  yield node

  current = node.child

  while current
    # Remember the left neighbour before the subtree walk, because the walk
    # may detach +current+ from +node+.
    before = current.previous_sibling
    traverse(current, &block)

    current =
      if current.parent == node
        current.next_sibling
      elsif before
        # The child was unlinked or reparented, so continue with the previous
        # node's next sibling...
        before.next_sibling
      else
        # ...or with the parent's first child if there was no previous node.
        node.child
      end
  end
end