! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors fry html.parser html.parser.analyzer
http.client kernel tools.time sets assocs sequences
concurrency.combinators io threads namespaces math multiline
heaps math.parser inspector urls assoc-heaps logging
combinators.short-circuit continuations calendar prettyprint ;
IN: spider
TUPLE: spider base count max-count sleep max-depth initial-links
filters spidered todo nonmatching quiet ;
! secure? agent page-timeout data-timeout overall-timeout
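! Slot overview:
!   base          - url the crawl starts from; only links on its host are followed
!   count/max-count - pages fetched so far / upper bound on pages to fetch
!   sleep         - optional pause between page fetches, passed to sleep when set
!   max-depth     - maximum link depth to follow from the base url
!   initial-links - extra links queued at depth 0 alongside the base url
!   filters       - optional sequence of ( url -- ? ) quotations a link must satisfy
!   spidered      - assoc mapping each visited url to its spider-result
!   todo/nonmatching - unique min-heaps of pending on-site links and recorded off-site links
!   quiet         - suppress progress output when true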
TUPLE: spider-result url depth headers fetch-time parsed-html
links processing-time timestamp ;
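! Each fetched page is recorded as a spider-result: the url and crawl depth,
! the response headers, timings for the fetch and the parse, the parsed html,
! the links extracted from it, and the time the page was visited.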
: <spider> ( base -- spider )
    >url spider new over >>base
    swap 0 <unique-min-heap> [ heap-push ] keep >>todo
    <unique-min-heap> >>nonmatching
    0 >>max-depth 0 >>count 1/0. >>max-count
    H{ } clone >>spidered ;
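! <spider> seeds the todo min-heap with the base url at depth 0, so the crawl
! always starts from the base page.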
: relative-url? ( url -- ? ) protocol>> not ;
: apply-filters ( links spider -- links' )
    filters>> [ '[ _ 1&& ] filter ] when* ;
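! apply-filters treats the filters slot, when set, as a sequence of
! ( url -- ? ) quotations combined with 1&&: a link survives only if every
! filter accepts it.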
: push-links ( links level assoc-heap -- )
    '[ _ _ heap-push ] each ;
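! push-links keys each link by its depth, so the min-heap pops shallower
! pages before deeper ones.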
: add-todo ( links level spider -- )
    todo>> push-links ;
: add-nonmatching ( links level spider -- )
    nonmatching>> push-links ;
: filter-base ( spider spider-result -- base-links nonmatching-links )
    [ base>> host>> ] [ links>> prune ] bi*
    [ host>> = ] with partition ;
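! filter-base prunes duplicate links, then partitions them into links on the
! base url's host (to be crawled) and links pointing elsewhere (recorded only).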
: add-spidered ( spider spider-result -- )
    [ [ 1+ ] change-count ] dip
    2dup [ spidered>> ] [ dup url>> ] bi* rot set-at
    [ filter-base ] 2keep
    depth>> 1+ swap
    [ add-nonmatching ]
    [ tuck [ apply-filters ] 2dip add-todo ] 2bi ;
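! add-spidered bumps the page count, records the result under its url, pushes
! off-site links onto nonmatching, and queues the filtered same-host links at
! depth + 1.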
: normalize-hrefs ( links -- links' )
    [ >url ] map
    spider get base>> swap [ derive-url ] with map ;
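! normalize-hrefs converts href strings into url objects and resolves
! relative links against the spider's base url.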
: print-spidering ( url depth -- )
    "depth: " write number>string write
    ", spidering: " write . yield ;
: (spider-page) ( url depth -- spider-result )
    f pick spider get spidered>> set-at
    over '[ _ http-get ] benchmark swap
    [ parse-html dup find-hrefs normalize-hrefs ] benchmark
    now spider-result boa ;
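! (spider-page) first fills the url's slot in the spidered table with f, then
! times the fetch and the parse separately with benchmark and combines the
! pieces, stamped with the current time, into a spider-result.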
: spider-page ( url depth -- )
    spider get quiet>> [ 2dup print-spidering ] unless
    (spider-page)
    spider get [ quiet>> [ dup describe ] unless ]
    [ swap add-spidered ] bi ;
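! Unless quiet, spider-page announces the page before fetching and describes
! the spider-result afterwards; either way the result is folded into the
! spider's state.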
\ spider-page ERROR add-error-logging
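! Errors thrown while spidering a page are logged at the ERROR log level.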
: spider-sleep ( -- )
    spider get sleep>> [ sleep ] when* ;
: queue-initial-links ( spider -- spider )
    [ initial-links>> normalize-hrefs 0 ] keep
    [ add-todo ] keep ;
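! queue-initial-links queues any user-supplied initial-links at depth 0,
! alongside the base url pushed by <spider>.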
: run-spider ( spider -- spider )
    "spider" [ dup spider [
        queue-initial-links
        [ todo>> ] [ max-depth>> ] bi
        '[ _ <= spider get [ count>> ] [ max-count>> ] bi < and ]
        [ spider-page spider-sleep ] slurp-heap-when
        spider get
    ] with-variable ] with-logging ;
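! Illustrative usage sketch; the url and the depth/count limits below are
! arbitrary example values, not defaults of this vocabulary:
!
!     USING: accessors prettyprint spider ;
!     "http://example.com/" <spider>
!         2 >>max-depth
!         50 >>max-count
!         t >>quiet
!     run-spider spidered>> keys .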