In Files

Parent

Files

Class/Module Index [+]

Quicksearch

Robots::ParsedRobots

Public Class Methods

new(uri, user_agent) click to toggle source
# File lib/robots.rb, line 12
# Fetches the robots.txt for +uri+ (via Robots.get_robots_txt) and parses it
# into per-user-agent rule tables.
#
# uri        - passed through to Robots.get_robots_txt unchanged.
# user_agent - passed through to Robots.get_robots_txt unchanged.
#
# After parsing, the instance holds:
#   @allows    - { agent Regexp => [path Regexp, ...] } from "Allow" lines
#   @disallows - { agent Regexp => [path Regexp, ...] } from "Disallow" lines
#   @delays    - { agent Regexp => Integer } from "Crawl-delay" lines
#   @other     - { field name String => [value String, ...] } for every
#                other field
#
# Field names are matched case-insensitively and with surrounding whitespace
# ignored, so "User-Agent:", "disallow:" and " Allow :" are all recognised.
# Unrecognised fields keep their original spelling (whitespace-stripped) as
# the key in @other.
def initialize(uri, user_agent)
  # Far in the past, so the first crawl-delay check never has to wait.
  @last_accessed = Time.at(1)

  # NOTE(review): the response is presumably open-uri style (responds to
  # #content_type, #status and #each) - confirm against Robots.get_robots_txt.
  io = Robots.get_robots_txt(uri, user_agent)

  # Anything other than a plain-text 200 response is treated as
  # "no restrictions".
  if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
    io = StringIO.new("User-agent: *\nAllow: /\n")
  end

  @other = {}
  @disallows = {}
  @allows = {}
  @delays = {}
  # Rules seen before any User-agent line apply to every agent.
  agent = /.*/

  io.each do |line|
    # Skip blank lines and whole-line comments.
    next if line =~ /^\s*(#.*|$)/

    # Split on the first colon only, so values that contain colons
    # (e.g. URLs) survive intact.
    key, value = line.split(":", 2)
    key = key.strip
    value = value.to_s.strip

    case key.downcase
    when "user-agent"
      agent = to_regex(value)
    when "allow"
      (@allows[agent] ||= []) << to_regex(value)
    when "disallow"
      (@disallows[agent] ||= []) << to_regex(value)
    when "crawl-delay"
      @delays[agent] = value.to_i
    else
      (@other[key] ||= []) << value
    end
  end

  @parsed = true
end

Public Instance Methods

allowed?(uri, user_agent) click to toggle source
# File lib/robots.rb, line 52
# Reports whether +user_agent+ may fetch +uri+ according to the parsed rules.
#
# uri        - an object responding to #request_uri (e.g. a URI::HTTP).
# user_agent - the user-agent String to test against the rule tables.
#
# A path is refused when any Disallow rule of a matching agent matches it,
# unless an Allow rule of a matching agent also matches it.
#
# When the request is allowed and one or more Crawl-delay entries match the
# user agent, this method blocks (sleeps) until the largest matching delay
# has elapsed since the previous allowed request, then records the current
# time as the last access.
#
# Returns true or false. Always returns true when nothing has been parsed.
def allowed?(uri, user_agent)
  return true unless @parsed

  path = uri.request_uri

  # True when any rule belonging to an agent pattern that matches
  # +user_agent+ also matches the requested path.
  rule_hit = lambda do |table|
    table.any? do |agent, rules|
      user_agent =~ agent && rules.any? { |rule| path =~ rule }
    end
  end

  allowed = !rule_hit.call(@disallows) || rule_hit.call(@allows)

  # @delays is keyed by agent Regexp, so it has to be searched by matching
  # rather than looked up with the user-agent string. The largest matching
  # delay wins so the crawler errs on the polite side.
  delay = @delays.map { |agent, seconds| seconds if user_agent =~ agent }.compact.max

  if allowed && delay
    remaining = delay - (Time.now - @last_accessed)
    # Kernel#sleep raises ArgumentError for negative intervals.
    sleep remaining if remaining > 0
    @last_accessed = Time.now
  end

  allowed
end
other_values() click to toggle source
# File lib/robots.rb, line 87
# Returns the Hash of robots.txt fields that are not handled as
# User-agent, Allow, Disallow or Crawl-delay rules. Each key is a field
# name and each value is the Array of values seen for that field, in the
# order they were read.
def other_values
  @other
end

Protected Instance Methods

to_regex(pattern) click to toggle source
# File lib/robots.rb, line 93
# Converts a robots.txt pattern (a path rule or a user-agent value) into a
# Regexp anchored at the start of the string.
#
# pattern - the pattern String as it appears after the field name.
#
# Translation rules:
#   * a blank pattern yields a sentinel Regexp that is not expected to match
#     any real input (an empty "Disallow:" restricts nothing);
#   * "*" matches any run of characters;
#   * a trailing "$" anchors the match at the end of the string instead of
#     being treated as a literal dollar sign;
#   * every other character is matched literally.
#
# Returns a Regexp.
def to_regex(pattern)
  return /should-not-match-anything-123456789/ if pattern.strip.empty?

  end_anchored = pattern.end_with?("$")
  body = end_anchored ? pattern[0...-1] : pattern

  # Escape everything, then turn the escaped "*" back into a wildcard.
  source = Regexp.escape(body).gsub(Regexp.escape("*"), ".*")

  Regexp.compile(end_anchored ? "^#{source}\\z" : "^#{source}")
end

[Validate]

Generated with the Darkfish Rdoc Generator 2.