! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays html.parser.utils hashtables io kernel
namespaces make prettyprint quotations sequences splitting
-state-parser strings unicode.categories unicode.case ;
+html.parser.state strings unicode.categories unicode.case ;
IN: html.parser
TUPLE: tag name attributes text closing? ;
[ get-char CHAR: " = ] take-until ;
: read-quote ( -- string )
- get-char next* CHAR: ' =
- [ read-single-quote ] [ read-double-quote ] if next* ;
+ get-char next CHAR: ' =
+ [ read-single-quote ] [ read-double-quote ] if next ;
: read-key ( -- string )
read-whitespace*
: read-= ( -- )
read-whitespace*
- [ get-char CHAR: = = ] take-until drop next* ;
+ [ get-char CHAR: = = ] take-until drop next ;
: read-value ( -- string )
read-whitespace*
[ blank? ] trim ;
: read-comment ( -- )
- "-->" take-string* make-comment-tag push-tag ;
+ "-->" take-string make-comment-tag push-tag ;
: read-dtd ( -- )
- ">" take-string* make-dtd-tag push-tag ;
+ ">" take-string make-dtd-tag push-tag ;
: read-bang ( -- )
- next* get-char CHAR: - = get-next CHAR: - = and [
- next* next*
+ next get-char CHAR: - = get-next CHAR: - = and [
+ next next
read-comment
] [
read-dtd
: read-tag ( -- string )
[ get-char CHAR: > = get-char CHAR: < = or ] take-until
- get-char CHAR: < = [ next* ] unless ;
+ get-char CHAR: < = [ next ] unless ;
: read-< ( -- string )
- next* get-char CHAR: ! = [
+ next get-char CHAR: ! = [
read-bang f
] [
read-tag
--- /dev/null
+USING: tools.test html.parser.state ascii kernel ;
+IN: html.parser.state.tests
+
+: take-rest ( -- string )
+ [ f ] take-until ;
+
+: take-char ( -- string )
+ [ get-char = ] curry take-until ;
+
+[ "hello" ] [ "hello" [ take-rest ] string-parse ] unit-test
+[ "hi" " how are you?" ] [ "hi how are you?" [ [ get-char blank? ] take-until take-rest ] string-parse ] unit-test
+[ "foo" ";bar" ] [ "foo;bar" [ CHAR: ; take-char take-rest ] string-parse ] unit-test
+! [ "foo " " bar" ] [ "foo and bar" [ "and" take-string take-rest ] string-parse ] unit-test
--- /dev/null
+! Copyright (C) 2005, 2009 Daniel Ehrenberg
+! See http://factorcode.org/license.txt for BSD license.
+USING: namespaces math kernel sequences accessors fry circular ;
+IN: html.parser.state
+
+TUPLE: state string i ;
+
+: get-i ( -- i ) state get i>> ;
+
+: get-char ( -- char )
+ state get [ i>> ] [ string>> ] bi ?nth ;
+
+: get-next ( -- char )
+ state get [ i>> 1+ ] [ string>> ] bi ?nth ;
+
+: next ( -- )
+ state get [ 1+ ] change-i drop ;
+
+: string-parse ( string quot -- )
+ [ 0 state boa state ] dip with-variable ;
+
+: short* ( n seq -- n' seq )
+ over [ nip dup length swap ] unless ;
+
+: skip-until ( quot: ( -- ? ) -- )
+ get-char [
+ [ call ] keep swap
+ [ drop ] [ next skip-until ] if
+ ] [ drop ] if ; inline recursive
+
+: take-until ( quot: ( -- ? ) -- )
+ [ get-i ] dip skip-until get-i
+ state get string>> subseq ;
+
+: string-matches? ( string circular -- ? )
+ get-char over push-circular sequence= ;
+
+: take-string ( match -- string )
+ dup length <circular-string>
+ [ 2dup string-matches? ] take-until nip
+ dup length rot length 1- - head next ;
USING: assocs combinators continuations hashtables
hashtables.private io kernel math
namespaces prettyprint quotations sequences splitting
-state-parser strings tools.test ;
+strings tools.test ;
USING: html.parser.utils ;
IN: html.parser.utils.tests
! See http://factorcode.org/license.txt for BSD license.
USING: assocs circular combinators continuations hashtables
hashtables.private io kernel math namespaces prettyprint
-quotations sequences splitting state-parser strings
+quotations sequences splitting html.parser.state strings
combinators.short-circuit ;
IN: html.parser.utils
: string-parse-end? ( -- ? ) get-next not ;
-: take-string* ( match -- string )
- dup length <circular-string>
- [ 2dup string-matches? ] take-until nip
- dup length rot length 1- - head next* ;
-
: trim1 ( seq ch -- newseq )
[ [ ?head-slice drop ] [ ?tail-slice drop ] bi ] 2keep drop like ;