1 ! Copyright (C) 2008 Doug Coleman.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors fry html.parser html.parser.analyzer
4 http.client kernel tools.time sets assocs sequences
5 concurrency.combinators io threads namespaces math multiline
6 heaps math.parser inspector urls assoc-deques logging
7 combinators.short-circuit continuations calendar prettyprint ;
! Crawler state. Slot uses visible in this file:
!   base          - root URL; its host>> decides matching vs. nonmatching links
!   count         - pages fetched so far (bumped via change-count in add-spidered)
!   max-count     - fetch limit compared against count in run-spider
!   sleep         - optional inter-request delay (used via sleep>> ... when*)
!   max-depth     - crawl depth limit read in run-spider
!   initial-links - seed links queued by queue-initial-links
!   filters       - optional sequence of link predicates (see apply-filters)
!   spidered      - assoc of url -> spider-result (also used to dedupe todo links)
!   todo          - unique min-heap of pending links keyed by depth
!   nonmatching   - unique min-heap of off-site links keyed by depth
TUPLE: spider base count max-count sleep max-depth initial-links
filters spidered todo nonmatching ;
! TODO: candidate spider slots not yet implemented:
! secure? agent page-timeout data-timeout overall-timeout
! Record of one crawled page: the url and crawl depth it was reached at,
! the HTTP response headers, the timed fetch (fetch-time) and parse
! (processing-time) phases -- see the two benchmark calls in (spider-page) --
! plus the parsed DOM and the links extracted from it.
! NOTE(review): timestamp is presumably the fetch wall-clock time (calendar
! is imported) but its construction is not visible in this chunk -- confirm.
TUPLE: spider-result url depth headers fetch-time parsed-html
links processing-time timestamp ;
! Build a spider rooted at the given base URL.
! NOTE(review): interior lines of this definition are missing from this
! chunk (likely tuple creation and default slot values); only the
! heap/assoc initialization below is visible -- confirm against full source.
: <spider> ( base -- spider )
    ! Seed the todo min-heap with the base itself, keyed at depth 0.
    swap 0 <unique-min-heap> [ heap-push ] keep >>todo
    <unique-min-heap> >>nonmatching
    ! Fresh url -> spider-result assoc; clone so instances never share it.
    H{ } clone >>spidered ;
! Run the spider's optional link filters. When the filters slot holds a
! sequence of predicates, keep only links satisfying all of them (1&&);
! when the slot is f, the links pass through untouched.
: apply-filters ( links spider -- links' )
    filters>> [ [ 1&& ] curry filter ] [ ] if* ;
! Queue links for crawling at the given depth: run them through the
! spider's filters, drop URLs already present in the spidered assoc,
! and push the survivors onto the todo heap keyed by depth.
! NOTE(review): one original line is missing from this chunk between the
! 2dip and diff lines -- as shown, the stack does not balance for 2bi*
! (three items where four are needed); confirm against the full source.
: add-todo ( links level spider -- )
    tuck [ apply-filters ] 2dip
    [ spidered>> keys diff ]
    [ todo>> ] 2bi* '[ _ _ heap-push ] each ;
! Record links whose host differs from the base: push each one onto the
! spider's nonmatching heap, keyed by the depth at which it was found.
! ( [ heap-push ] 2curry builds [ level heap heap-push ], the same
! quotation the fried form '[ _ _ heap-push ] would produce. )
: add-nonmatching ( links level spider -- )
    nonmatching>> [ heap-push ] 2curry each ;
42 : relative-url? ( url -- ? ) protocol>> not ;
! Split a result's links into those on the spider's base host and the
! rest; duplicate links are pruned first.
! Output order matches partition: ( matching nonmatching ).
: filter-base ( spider spider-result -- base-links nonmatching-links )
    swap [ links>> prune ] [ base>> host>> ] bi* swap
    [ host>> = ] with partition ;
! Record one fetched page: bump the spider's page count, then store the
! result in the spidered assoc under its own url.
! NOTE(review): this definition continues past the visible lines -- no
! terminating ; appears in this chunk; confirm the remainder in full source.
: add-spidered ( spider spider-result -- )
    [ [ 1+ ] change-count ] dip
    ! set-at receives ( result url assoc ): value keyed by the result's url.
    2dup [ spidered>> ] [ dup url>> ] bi* rot set-at
! Progress logging: print the crawl depth and the URL about to be
! fetched ( "depth: N, spidering: URL" ), then yield so other
! coroutines get a chance to run.
: print-spidering ( url depth -- )
    number>string "depth: " prepend write
    ", spidering: " write . yield ;
! Make every link absolute by resolving it against the current spider's
! base URL. Reads the spider dynamic variable, so callers must run
! inside a scope where it is set.
! NOTE(review): one original line is missing from this chunk between the
! header and the map line (possibly a string->url conversion step) --
! confirm against the full source.
: normalize-hrefs ( links -- links' )
    spider get base>> swap [ derive-url ] with map ;
! Fetch and parse one page, timing both phases with benchmark.
! NOTE(review): lines are missing both inside this definition and at its
! end (no ; visible) -- presumably the tail assembles the spider-result;
! confirm against the full source.
: (spider-page) ( url depth -- spider-result )
    ! Pre-mark this url in the spidered assoc (value f) so add-todo's
    ! diff against spidered>> keys will not queue it again.
    f pick spider get spidered>> set-at
    ! Timed HTTP fetch: benchmark yields the http-get results plus the
    ! elapsed time (the eventual fetch-time slot).
    over '[ _ http-get ] benchmark swap
    ! Timed parse: build the DOM, extract hrefs, absolutize them.
    [ parse-html dup find-hrefs normalize-hrefs ] benchmark
! Crawl a single page and record its spider-result on the current
! spider (read from the spider dynamic variable).
: spider-page ( url depth -- )
    (spider-page) [ spider get ] dip add-spidered ;
75 \ spider-page ERROR add-error-logging
78 spider get sleep>> [ sleep ] when* ;
! Seed the crawl: absolutize the spider's initial-links and queue them
! at depth 0, keeping the spider on the stack for chaining.
! NOTE(review): the definition continues past the visible lines (no ;
! in this chunk); presumably it finishes by calling add-todo -- confirm.
: queue-initial-links ( spider -- spider )
    [ initial-links>> normalize-hrefs 0 ] keep
! Main crawl loop: drain the todo heap via slurp-heap-when, fetching
! each page (spider-page) and sleeping between requests (spider-sleep),
! while depth stays within max-depth and count stays below max-count.
! NOTE(review): several interior lines are missing from this chunk,
! including the opening of the slurp-heap-when predicate quotation
! (note the unmatched ] below) -- do not edit without the full source.
: run-spider ( spider -- spider )
    [ todo>> ] [ max-depth>> ] bi
    [ count>> ] [ max-count>> ] bi < and
    ] [ spider-page spider-sleep ] slurp-heap-when