! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
-USING: accessors fry html.parser html.parser.analyzer
-http.client kernel tools.time sets assocs sequences
-concurrency.combinators io threads namespaces math multiline
-math.parser inspector urls logging combinators.short-circuit
-continuations calendar prettyprint dlists deques locals
-spider.unique-deque combinators concurrency.semaphores ;
+USING: accessors assocs calendar combinators.short-circuit
+concurrency.semaphores deques html.parser html.parser.analyzer
+http.client inspector io io.pathnames kernel logging math
+math.parser prettyprint sequences sets spider.unique-deque
+threads tools.time urls ;
IN: spider
TUPLE: spider
"depth: " write number>string write
", spidering: " write . yield ;
+! Decide whether a URL's path looks like an HTML page: its file
+! extension is "htm" or "html", or it has no extension at all
+! (file-extension outputs f in that case).
+! file-extension outputs the extension WITHOUT the leading dot, so
+! the candidates are listed dot-less; with dots they never match.
+: url-html? ( url -- ? )
+ path>> file-extension { "htm" "html" f } member? ;
+
:: fill-spidered-result ( spider spider-result -- )
- f spider-result url>> spider spidered>> set-at
- [ spider-result url>> http-get ] benchmark :> ( headers html fetched-in )
+ spider-result url>> :> url
+ f url spider spidered>> set-at
+ [ url http-get ] benchmark :> ( headers html fetched-in )
[
- html parse-html
- spider currently-spidering>>
- over find-all-links normalize-hrefs
+ url url-html? [
+ html parse-html
+ spider currently-spidering>>
+ over find-all-links normalize-hrefs
+ ] [
+ f { }
+ ] if
] benchmark :> ( parsed-html links processed-in )
spider-result
headers >>headers
dup todo>> pop-url [ url>> ] [ depth>> ] bi <spider-result> ;
+! Calls setup-next-url and then spider-page, consuming the spider.
: spider-next-page ( spider -- )
- setup-next-url spider-page ;
+ setup-next-url
+ spider-page ;
PRIVATE>