1 ! Copyright (C) 2008 Doug Coleman.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors arrays hashtables sequences.parser
4 html.parser.utils kernel namespaces sequences make math
5 unicode.case unicode.categories combinators.short-circuit
9 TUPLE: tag name attributes text closing? ;
22 : closing-tag? ( string -- ? )
24 [ { [ first CHAR: / = ] [ last CHAR: / = ] } 1|| ] if-empty ;
26 : <tag> ( name attributes closing? -- tag )
32 : make-tag ( string attribs -- tag )
33 [ [ closing-tag? ] keep "/" trim1 ] dip rot <tag> ;
35 : new-tag ( text name -- tag )
40 : (read-quote) ( sequence-parser ch -- string )
41 '[ [ current _ = ] take-until ] [ advance drop ] bi ;
43 : read-single-quote ( sequence-parser -- string )
44 CHAR: ' (read-quote) ;
46 : read-double-quote ( sequence-parser -- string )
47 CHAR: " (read-quote) ;
49 : read-quote ( sequence-parser -- string )
50 dup get+increment CHAR: ' =
51 [ read-single-quote ] [ read-double-quote ] if ;
53 : read-key ( sequence-parser -- string )
55 [ current { [ CHAR: = = ] [ blank? ] } 1|| ] take-until ;
57 : read-token ( sequence-parser -- string )
58 [ current blank? ] take-until ;
60 : read-value ( sequence-parser -- string )
62 dup current quote? [ read-quote ] [ read-token ] if
65 : read-comment ( sequence-parser -- )
66 [ "-->" take-until-sequence comment new-tag push-tag ]
67 [ '[ _ advance drop ] 3 swap times ] bi ;
69 : read-dtd ( sequence-parser -- )
70 [ ">" take-until-sequence dtd new-tag push-tag ]
73 : read-bang ( sequence-parser -- )
74 advance dup { [ current CHAR: - = ] [ peek-next CHAR: - = ] } 1&&
75 [ advance advance read-comment ] [ read-dtd ] if ;
77 : read-tag ( sequence-parser -- string )
79 [ current "><" member? ] take-until
80 [ CHAR: / = ] trim-tail
81 ] [ dup current CHAR: < = [ advance ] unless drop ] bi ;
83 : read-until-< ( sequence-parser -- string )
84 [ current CHAR: < = ] take-until ;
86 : parse-text ( sequence-parser -- )
87 read-until-< [ text new-tag push-tag ] unless-empty ;
89 : parse-key/value ( sequence-parser -- key value )
91 [ skip-whitespace "=" take-sequence ]
92 [ swap [ read-value ] [ drop dup ] if ] tri ;
94 : (parse-attributes) ( sequence-parser -- )
96 dup sequence-parse-end? [
99 [ parse-key/value swap ,, ] [ (parse-attributes) ] bi
102 : parse-attributes ( sequence-parser -- hashtable )
103 [ (parse-attributes) ] H{ } make ;
105 : (parse-tag) ( string -- string' hashtable )
107 [ read-token >lower ] [ parse-attributes ] bi
110 : read-< ( sequence-parser -- string/f )
111 advance dup current [
112 CHAR: ! = [ read-bang f ] [ read-tag ] if
117 : parse-tag ( sequence-parser -- )
118 read-< [ (parse-tag) make-tag push-tag ] unless-empty ;
120 : (parse-html) ( sequence-parser -- )
122 [ parse-text ] [ parse-tag ] [ (parse-html) ] tri
125 : tag-parse ( quot -- vector )
126 V{ } clone tagstack [ parse-sequence ] with-variable ; inline
130 : parse-html ( string -- vector )
131 [ (parse-html) tagstack get ] tag-parse ;