! Copyright (C) 2009 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors http.client kernel unicode.categories
sequences urls splitting combinators splitting.monotonic
combinators.short-circuit assocs unicode.case arrays
math.parser calendar.format make fry present globs
multiline regexp.combinators regexp ;
IN: robots
! visit-time is GMT, request-rate is pages/second
! crawl-delay is seconds
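! A robots tuple holds the site url, its sitemap urls, the parsed
! rule groups, and a slot for a compiled rules quotation.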
TUPLE: robots site sitemap rules rules-quot ;

: <robots> ( site sitemap rules -- robots )
    \ robots new swap >>rules swap >>sitemap swap >>site ;
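! One rules group per User-agent block: agent names, allow/disallow
! patterns, crawl hints, and an assoc of unrecognized directives.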
TUPLE: rules user-agents allows disallows
visit-time request-rate crawl-delay unknowns ;

<PRIVATE
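! Derive the robots.txt url for a site url.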
: >robots.txt-url ( url -- url' )
    >url URL" robots.txt" derive-url ;
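! Fetch a site's robots.txt over HTTP.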
: get-robots.txt ( url -- headers robots.txt )
    >robots.txt-url http-get ;
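! Split robots.txt into trimmed key/value pairs with lowercased keys,
! pull out the Sitemap entries, and group the rest by User-agent block.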
: normalize-robots.txt ( string -- sitemaps seq )
    string-lines
    [ [ blank? ] trim ] map
    [ "#" head? not ] filter harvest
    [ ":" split1 [ [ blank? ] trim ] bi@ [ >lower ] dip ] { } map>assoc
    [ first "sitemap" = ] partition [ values ] dip
    [
        {
            [ [ first "user-agent" = ] bi@ and ]
            [ nip first "user-agent" = not ]
        } 2||
    ] monotonic-split ;
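! A fresh rules group with empty accumulators.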
: <rules> ( -- rules )
    rules new
        V{ } clone >>user-agents
        V{ } clone >>allows
        V{ } clone >>disallows
        H{ } clone >>unknowns ;
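! Accumulate values for the current group; allow and disallow values
! are parsed as urls.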
: add-user-agent ( rules agent -- rules ) over user-agents>> push ;
: add-allow ( rules allow -- rules ) >url over allows>> push ;
: add-disallow ( rules disallow -- rules ) >url over disallows>> push ;
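! Dispatch on the directive name; unrecognized directives are
! collected in the unknowns assoc.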
: parse-robots.txt-line ( rules seq -- rules )
    first2 swap {
        { "user-agent" [ add-user-agent ] }
        { "allow" [ add-allow ] }
        { "disallow" [ add-disallow ] }
        { "crawl-delay" [ string>number >>crawl-delay ] }
        { "request-rate" [ string>number >>request-rate ] }
        {
            "visit-time" [ "-" split1 [ hhmm>timestamp ] bi@ 2array
            >>visit-time ]
        }
        [ pick unknowns>> push-at ]
    } case ;
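! Resolve each allow/disallow pattern against the site url and render
! it as a string for glob matching.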
: derive-urls ( url seq -- seq' )
    [ derive-url present ] with { } map-as ;
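! Build a quotation that tests whether a url string matches the allow
! globs or the negation of the disallow globs.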
: robot-rules-quot ( robots -- quot )
    [
        [ site>> ] [ rules>> allows>> ] bi
        derive-urls [ <glob> ] map
        <or>
    ] [
        [ site>> ] [ rules>> disallows>> ] bi
        derive-urls [ <glob> ] map <and> <not>
    ] bi 2array <or> '[ _ matches? ] ;

PRIVATE>
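! Parse robots.txt text into sitemap urls and a sequence of rules
! tuples, one per User-agent group.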
: parse-robots.txt ( string -- sitemaps rules-seq )
    normalize-robots.txt [
        [ <rules> dup ] dip [ parse-robots.txt-line drop ] with each
    ] map ;
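! Fetch and parse a site's robots.txt into a robots tuple.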
: robots ( url -- robots )
    >url
    dup get-robots.txt nip parse-robots.txt <robots> ;
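! Example (requires network access; the host below is only an
! illustration):
!
!     "http://concatenative.org" robots .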