! Copyright (C) 2009 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
-USING: accessors http.client kernel unicode.categories
-sequences urls splitting combinators splitting.monotonic
-combinators.short-circuit assocs unicode.case arrays
-math.parser calendar.format make fry present globs
-multiline regexp.combinators regexp ;
+USING: accessors arrays assocs calendar.parser combinators
+combinators.short-circuit globs http.client kernel math.parser
+namespaces present regexp regexp.combinators sequences splitting
+splitting.monotonic unicode urls ;
IN: robots
! visit-time is GMT, request-rate is pages/second
! crawl-rate is seconds
+SYMBOL: robot-identities
+robot-identities [ { "FactorSpider" } ] initialize
+
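The new robot-identities symbol is consulted by relevant-rules further down; it defaults to { "FactorSpider" }. A crawler shipping under another name could override it from a listener, roughly like this (sketch only; the crawler name is a placeholder, not part of the patch):

    USING: namespaces robots ;
    { "MyCrawler" } robot-identities set-global
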
TUPLE: robots site sitemap rules rules-quot ;
: <robots> ( site sitemap rules -- robots )
: >robots.txt-url ( url -- url' )
>url URL" robots.txt" derive-url ;
-: get-robots.txt ( url -- headers robots.txt )
- >robots.txt-url http-get ;
+: get-robots.txt ( url -- robots.txt )
+ >robots.txt-url http-get nip ;
: normalize-robots.txt ( string -- sitemaps seq )
- string-lines
- [ [ blank? ] trim ] map
- [ "#" head? not ] filter harvest
- [ ":" split1 [ [ blank? ] trim ] bi@ [ >lower ] dip ] { } map>assoc
+ split-lines
+ [ [ blank? ] trim ] map
+ [ "#" head? ] reject harvest
+ [ ":" split1 [ [ unicode:blank? ] trim ] bi@ [ >lower ] dip ] { } map>assoc
[ first "sitemap" = ] partition [ values ] dip
[
{
- [ [ first "user-agent" = ] bi@ and ]
+ [ [ first "user-agent" = ] both? ]
[ nip first "user-agent" = not ]
} 2||
] monotonic-split ;
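
To sanity-check the reject/both? rewrite, this is roughly what the normalizer should yield for a minimal robots.txt, run from a listener after the vocabulary is loaded (input text and URL are made up; the exact printed form of the groups is not asserted):

    USING: kernel prettyprint robots robots.private ;
    "User-agent: *\nDisallow: /private/\nSitemap: http://www.example.com/sitemap.xml"
    normalize-robots.txt [ . ] bi@
    ! sitemaps: { "http://www.example.com/sitemap.xml" }
    ! groups:   one group holding the user-agent and disallow pairs
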
: <rules> ( -- rules )
{ "crawl-delay" [ string>number >>crawl-delay ] }
{ "request-rate" [ string>number >>request-rate ] }
{
- "visit-time" [ "-" split1 [ hhmm>timestamp ] bi@ 2array
+ "visit-time" [ "-" split1 [ hhmm>duration ] bi@ 2array
>>visit-time
] }
[ pick unknowns>> push-at ]
derive-urls [ <glob> ] map <and> <not>
] bi 2array <or> '[ _ matches? ] ;
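
With calendar.format swapped for calendar.parser, a Visit-time range such as "0900-1700" now parses into a pair of durations rather than timestamps. A rough listener check (output shape not asserted):

    USING: arrays calendar.parser kernel prettyprint splitting ;
    "0900-1700" "-" split1 [ hhmm>duration ] bi@ 2array .
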
+: relevant-rules ( robots -- rules )
+ rules>> [
+ user-agents>> [
+ robot-identities get [ swap glob-matches? ] with any?
+ ] any?
+ ] filter ;
+
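relevant-rules keeps only the rule groups whose User-agent patterns glob-match one of the configured identities. The primitive doing the work is glob-matches? from globs, for example (values made up):

    USING: globs prettyprint ;
    "FactorSpider" "*" glob-matches? .          ! t
    "FactorSpider" "Googlebot" glob-matches? .  ! f
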
PRIVATE>
: parse-robots.txt ( string -- sitemaps rules-seq )
normalize-robots.txt [
[ <rules> dup ] dip [ parse-robots.txt-line drop ] with each
- ] map first ;
+ ] map ;
-: robots ( url -- robots )
- >url
- dup get-robots.txt nip parse-robots.txt <robots> ;
+: url>robots ( url -- robots )
+ >url dup get-robots.txt parse-robots.txt <robots> ;
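
Finally, a minimal sketch of the renamed entry point from a listener; the host is a placeholder and the call performs a live HTTP fetch of /robots.txt:

    USING: prettyprint robots ;
    "https://www.example.com/" url>robots .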