! Copyright (C) 2009 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
-USING: accessors arrays assocs calendar.format combinators
-combinators.short-circuit fry globs http.client kernel make
-math.parser multiline namespaces present regexp
-regexp.combinators sequences sets splitting splitting.monotonic
-unicode.case unicode.categories urls ;
+USING: accessors arrays assocs calendar.parser combinators
+combinators.short-circuit globs http.client kernel math.parser
+namespaces present regexp regexp.combinators sequences splitting
+splitting.monotonic unicode urls ;
IN: robots
! visit-time is given in GMT; request-rate is in pages per second
>robots.txt-url http-get nip ;
: normalize-robots.txt ( string -- sitemaps seq )
- string-lines
- [ [ blank? ] trim ] map
+ split-lines
+ [ [ unicode:blank? ] trim ] map
[ "#" head? ] reject harvest
- [ ":" split1 [ [ blank? ] trim ] bi@ [ >lower ] dip ] { } map>assoc
+ [ ":" split1 [ [ unicode:blank? ] trim ] bi@ [ >lower ] dip ] { } map>assoc
[ first "sitemap" = ] partition [ values ] dip
[
{
{ "crawl-delay" [ string>number >>crawl-delay ] }
{ "request-rate" [ string>number >>request-rate ] }
{
- "visit-time" [ "-" split1 [ hhmm>timestamp ] bi@ 2array
+ "visit-time" [ "-" split1 [ hhmm>duration ] bi@ 2array
>>visit-time
] }
[ pick unknowns>> push-at ]