class Robotex::ParsedRobots

Public Class Methods

new(uri, user_agent)
# File lib/robotex.rb, line 18
# Fetches the robots.txt for +uri+ via Robotex.get_robots_txt and parses it
# into per-user-agent rule tables.
#
# uri        - passed through to Robotex.get_robots_txt
# user_agent - passed through to Robotex.get_robots_txt
#
# Populates:
#   @allows    - Hash of user-agent Regexp => Array of path Regexps (Allow)
#   @disallows - Hash of user-agent Regexp => Array of path Regexps (Disallow)
#   @delays    - Hash of user-agent Regexp => Integer (Crawl-delay)
#   @parsed    - set to true once parsing has finished
def initialize(uri, user_agent)
  io = Robotex.get_robots_txt(uri, user_agent)

  # A missing response, a non-text/plain body or a non-200 status is treated
  # as a robots.txt that allows everything.
  if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
    io = StringIO.new("User-agent: *\nAllow: /\n")
  end

  @disallows = {}
  @allows = {}
  @delays = {}
  agent = /.*/ # rules seen before any User-agent line apply to everyone

  io.each do |line|
    # Drop comments (whole-line or trailing), then split on the first colon
    # only so that values containing ":" stay intact.
    key, value = line.sub(/#.*/, "").split(":", 2)

    # Blank lines, comment-only lines and lines without a colon carry no rule.
    next if key.nil? || value.nil?

    value = value.strip
    case key.strip.downcase
    when "user-agent"
      agent = to_regex(value)
    when "allow"
      (@allows[agent] ||= []) << to_regex(value)
    when "disallow"
      (@disallows[agent] ||= []) << to_regex(value)
    when "crawl-delay"
      @delays[agent] = value.to_i
    end
  end

  @parsed = true
end

Public Instance Methods

allowed?(uri, user_agent)
# File lib/robotex.rb, line 52
# Reports whether +user_agent+ may fetch +uri+ according to the parsed rules.
#
# Among all Allow and Disallow rules whose user-agent pattern matches
# +user_agent+ and whose path pattern matches the request path, the most
# specific rule wins and a tie goes to Allow (the precedence described by
# RFC 9309). Specificity is approximated by the length of the compiled
# rule's regexp source.
#
# uri        - a URI, or any object whose +to_s+ can be parsed by URI.parse
# user_agent - String compared against each stored user-agent pattern
#
# Returns true or false. Always true when @parsed is not set.
def allowed?(uri, user_agent)
  return true unless @parsed

  uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
  path = uri.request_uri

  # Length of the most specific rule in +rules_by_agent+ that applies to this
  # user agent and path, or nil when no rule applies.
  best_match = lambda do |rules_by_agent|
    best = nil
    rules_by_agent.each do |agent, rules|
      next unless user_agent =~ agent

      rules.each do |rule|
        next unless path =~ rule

        length = rule.source.length
        best = length if best.nil? || length > best
      end
    end
    best
  end

  disallow = best_match.call(@disallows)
  return true if disallow.nil? # nothing forbids this path

  allow = best_match.call(@allows)
  !allow.nil? && allow >= disallow
end
delay(user_agent)
# File lib/robotex.rb, line 83
# Looks up the crawl delay recorded for +user_agent+.
#
# Walks @delays in insertion order and returns the value stored under the
# first user-agent pattern that matches +user_agent+, or nil when no pattern
# matches.
def delay(user_agent)
  entry = @delays.find { |pattern, _seconds| pattern =~ user_agent }
  entry ? entry.last : nil
end

Protected Instance Methods

to_regex(pattern)
# File lib/robotex.rb, line 92
# Compiles a robots.txt value (a path rule or a user-agent name) into a
# Regexp anchored at the start of the string.
#
# - "*" matches any run of characters.
# - A trailing "$" anchors the rule to the end of the string.
# - Every other character is matched literally.
# - A blank pattern yields a Regexp that can never match.
#
# pattern - String taken from a robots.txt line
#
# Returns a Regexp.
def to_regex(pattern)
  pattern = pattern.strip
  return /(?!)/ if pattern.empty? # empty negative lookahead: matches nothing

  anchored = pattern.end_with?("$")
  pattern = pattern[0...-1] if anchored

  body = Regexp.escape(pattern).gsub(Regexp.escape("*"), ".*")
  Regexp.compile("\\A#{body}#{anchored ? '\\z' : ''}")
end