1 ! Copyright (C) 2008 Doug Coleman.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors fry html.parser html.parser.analyzer
4 http.client kernel tools.time sets assocs sequences
5 concurrency.combinators io threads namespaces math multiline
6 heaps math.parser inspector urls assoc-heaps logging
7 combinators.short-circuit continuations calendar prettyprint ;
! One crawl session. base: root URL whose host decides on-site vs
! off-site links. todo / nonmatching: depth-keyed unique-min-heaps of
! links to crawl resp. off-site links (see <spider>, add-nonmatching).
! spidered: assoc of url -> spider-result (see add-spidered).
! count: pages fetched so far; sleep: inter-fetch delay when set;
! quiet: suppresses progress output (see spider-page).
! max-count / max-depth: crawl limits — used in run-spider, whose body
! is truncated in this extract, so exact comparison semantics: confirm.
TUPLE: spider base count max-count sleep max-depth initial-links
filters spidered todo nonmatching quiet ;
! One fetched page, built by (spider-page) via boa: fetch-time and
! processing-time are benchmark timings for the HTTP GET and the
! parse/link-extraction step; timestamp is `now` at completion.
TUPLE: spider-result url depth headers fetch-time parsed-html
links processing-time timestamp ;
! Construct a spider rooted at the given base, with the base seeded
! onto the todo heap at depth 0 and fresh empty state.
! NOTE(review): lines are missing from this extract between the
! signature and the first visible body line, and before the final
! line; only visible code is annotated.
: <spider> ( base -- spider )
    ! Seed todo with the start link keyed by depth 0 — presumably the
    ! base URL; confirm against the truncated lines above.
    swap 0 <unique-min-heap> [ heap-push ] keep >>todo
    ! Off-site links accumulate here instead of being crawled.
    <unique-min-heap> >>nonmatching
    H{ } clone >>spidered ;
: apply-filters ( links spider -- links' )
    ! Keep only links that pass every filter quotation (1&& is a
    ! short-circuit AND over one input). A spider whose filters slot
    ! is f passes the links through unchanged.
    filters>> dup [ '[ _ 1&& ] filter ] [ drop ] if ;
: push-links ( links level assoc-heap -- )
    ! Insert every link into the heap, each keyed by the same depth
    ! level; 2curry bakes level and heap into the per-link quotation.
    [ heap-push ] 2curry each ;
! Queue links for crawling at the given depth.
! NOTE(review): the body of this definition is missing from this
! extract — presumably todo>> push-links, mirroring add-nonmatching;
! confirm against the full source.
: add-todo ( links level spider -- )
! Record links whose host differs from the base (see filter-base) in
! the spider's nonmatching heap, keyed by depth.
: add-nonmatching ( links level spider -- )
    nonmatching>> push-links ;
! Split a result's links into those on the same host as the spider's
! base URL and those pointing elsewhere.
: filter-base ( spider spider-result -- base-links nonmatching-links )
    ! prune removes duplicate links before the host comparison.
    [ base>> host>> ] [ links>> prune ] bi*
    [ host>> = ] with partition ;
! Bump the fetch counter, store the result in spidered under its url,
! and route its links onward (visible branch: filter then add-todo).
! NOTE(review): lines are missing from this extract between the
! set-at line and the final 2bi line — likely the nonmatching branch
! of the 2bi; only visible code is annotated.
: add-spidered ( spider spider-result -- )
    [ [ 1+ ] change-count ] dip
    ! spidered[ result's url ] := result
    2dup [ spidered>> ] [ dup url>> ] bi* rot set-at
    [ tuck [ apply-filters ] 2dip add-todo ] 2bi ;
! Resolve each extracted href against the current spider's base URL.
! NOTE(review): a line appears to be missing from this extract
! between the signature and the visible body (likely a conversion of
! raw hrefs to url objects); confirm against the full source.
: normalize-hrefs ( links -- links' )
    spider get base>> swap [ derive-url ] with map ;
: print-spidering ( url depth -- )
    ! Report progress as "depth: N, spidering: URL", then yield so
    ! other threads get a chance to run between fetches.
    number>string "depth: " write write
    ", spidering: " write . yield ;
! Fetch one page and package everything into a spider-result.
: (spider-page) ( url depth -- spider-result )
    ! Store f under the url in spidered before fetching —
    ! NOTE(review): presumably a placeholder so the url is marked as
    ! seen while the fetch is in flight; confirm.
    f pick spider get spidered>> set-at
    ! benchmark runs the quotation and pushes the elapsed time;
    ! first timing is the HTTP GET, second is parse + link extraction.
    over '[ _ http-get ] benchmark swap
    [ parse-html dup find-hrefs normalize-hrefs ] benchmark
    ! Slots filled positionally via boa; timestamp is `now`.
    now spider-result boa ;
! Crawl one url at one depth, honoring the quiet flag for output.
! NOTE(review): a line is missing from this extract between the
! progress print and the next visible line — likely the call that
! produces the spider-result consumed below; confirm.
: spider-page ( url depth -- )
    spider get quiet>> [ 2dup print-spidering ] unless
    ! Unless quiet, describe the result; then record it.
    spider get [ quiet>> [ dup describe ] unless ]
    [ swap add-spidered ] bi ;
! Wrap spider-page (logging vocab) so errors it throws are logged at
! ERROR level instead of unwinding out of the crawl loop.
\ spider-page ERROR add-error-logging
! NOTE(review): the defining line of this word is missing from this
! extract; this is the body of what is presumably spider-sleep.
! Sleeps for the spider's configured interval when one is set
! (sleep slot of f means no delay).
spider get sleep>> [ sleep ] when* ;
! Seed the crawl with the spider's initial links, normalized against
! the base URL, at depth 0. Returns the spider for chaining.
! NOTE(review): the final line(s) of this definition are missing from
! this extract; only visible code is annotated.
: queue-initial-links ( spider -- spider )
    [ initial-links>> normalize-hrefs 0 ] keep
! Repeatedly pop ( value key ) pairs off the heap, running quot2 on
! each, until the heap is empty or quot1 yields false.
! NOTE(review): the stack plumbing here is intricate; quot1 appears
! to act as the continue-predicate evaluated around each pop and
! quot2 as the per-item consumer — confirm before modifying.
: slurp-heap-while ( heap quot1 quot2: ( value key -- ) -- )
    pick heap-empty? [ 3drop ] [
        ! Pop once, decide via quot1 whether to keep going (t/f),
        ! then either recurse on the remaining heap or clean up.
        [ [ heap-pop dup ] 2dip slip [ t ] compose [ 2drop f ] if ]
        [ roll [ slurp-heap-while ] [ 3drop ] if ] 3bi
    ] if ; inline recursive
90 : run-spider ( spider -- spider )
94 [ todo>> ] [ max-depth>> ] bi
97 [ count>> ] [ max-count>> ] bi < and
98 ] [ spider-page spider-sleep ] slurp-heap-while