1 ! Copyright (C) 2021 Doug Coleman.
2 ! See https://factorcode.org/license.txt for BSD license.
3 USING: accessors arrays assocs combinators
4 combinators.short-circuit kernel make math modern modern.slices
5 sequences sequences.extras shuffle combinators.extras splitting
9 TUPLE: tag open name props close children ;
11 TUPLE: processing-instruction open target props close ;
12 : <processing-instruction> ( open target props close -- processing-instruction )
13 processing-instruction new
19 TUPLE: embedded-language open payload close ;
20 : <embedded-language> ( open payload close -- embedded-language )
26 TUPLE: doctype open close values ;
27 : <doctype> ( open values close -- doctype )
33 TUPLE: cdata open close value ;
34 : <cdata> ( open value close -- doctype )
40 TUPLE: comment open payload close ;
41 : <comment> ( open payload close -- comment )
47 TUPLE: close-tag name ;
48 : <close-tag> ( name -- tag )
50 swap >string rest rest but-last >>name ;
52 TUPLE: open-tag < tag close-tag ;
53 : <open-tag> ( open name props close -- tag )
59 V{ } clone >>children ;
61 TUPLE: self-close-tag < tag ;
62 : <self-close-tag> ( open name props close -- tag )
68 V{ } clone >>children ;
71 : <text> ( text -- text )
75 TUPLE: squote payload ;
77 TUPLE: dquote payload ;
80 : advance-dquote-payload-noescape ( n string -- n' string )
82 { CHAR: \" } slice-til-separator-inclusive {
83 { f [ to>> over string-expected-got-eof ] }
87 string-expected-got-eof
90 : advance-squote-payload-noescape ( n string -- n' string )
92 { CHAR: ' } slice-til-separator-inclusive {
93 { f [ to>> over string-expected-got-eof ] }
97 string-expected-got-eof
100 :: read-string ( $n $string $char -- n' string payload )
101 $n $string $char CHAR: ' =
102 [ advance-squote-payload-noescape ]
103 [ advance-dquote-payload-noescape ] if drop :> $n'
105 $n $n' 1 - $string <slice> ;
107 : take-tag-name ( n string -- n' string tag )
108 [ "\t\s\r\n/>" member? ] slice-until ;
110 : read-value ( n string -- n' string value )
112 { CHAR: ' [ CHAR: ' read-string >string <squote> ] }
113 { CHAR: " [ CHAR: " read-string >string <dquote> ] }
114 { CHAR: [ [ "[" throw ] }
115 { CHAR: { [ "{" throw ] }
116 [ [ take-tag-name ] dip prefix ]
119 : read-prop ( n string -- n' string prop/f closing/f )
120 skip-whitespace "\s\n\r\t\"'<=/>?" slice-til-either {
121 { CHAR: < [ "< error" throw ] }
122 { CHAR: = [ 1 split-slice-back drop >string [ read-value ] dip swap 2array f ] }
123 { CHAR: / [ ">" expect-and-span 2 split-slice-back [ >string f like ] bi@ ] }
124 { CHAR: > [ 1 split-slice-back [ >string f like ] bi@ ] }
125 { CHAR: ' [ first read-string >string <squote> f ] }
126 { CHAR: " [ first read-string >string <dquote> f ] }
127 { CHAR: ? [ ">" expect-and-span >string f swap ] }
128 { CHAR: \s [ [ 1 + ] 2dip >string f ] }
129 { CHAR: \r [ [ 1 + ] 2dip >string f ] }
130 { CHAR: \n [ [ 1 + ] 2dip >string f ] }
131 { CHAR: \t [ [ 1 + ] 2dip >string f ] }
132 { f [ "efff" throw ] }
135 : read-props* ( n string props -- n' string props closing )
136 [ read-prop ] dip-2up [
137 [ [ over push ] when* ] dip
139 [ over push ] when* read-props*
140 ] if* ; inline recursive
142 : read-props ( n string -- n' string props closing )
143 V{ } clone read-props* ;
145 : read-processing-instruction ( n string opening -- n string processing-instruction )
146 "?" expect-and-span >string
147 [ take-tag-name >string ] dip-1up
148 [ read-props ] 2dip-2up
149 <processing-instruction> ;
151 : read-doctype-or-cdata ( n string opening -- n string doctype/comment )
153 2over 2 peek-from "--" sequence= [
154 "--" expect-and-span >string
155 [ "-->" slice-til-string [ >string ] bi@ ] dip-2up <comment>
157 2over 1 peek-from "[" sequence= [
158 "[CDATA[" expect-and-span-insensitive
159 [ "]]" slice-til-string [ >string ] bi@ ] dip-2up <cdata>
161 "DOCTYPE" expect-and-span-insensitive
162 [ read-props ] dip-2up
167 : read-embedded-language ( n string opening -- n string embedded-language )
168 "%" expect-and-span >string
169 [ take-tag-name >string ] dip-1up append
170 [ "%>" slice-til-string [ >string ] bi@ ] dip-2up
171 <embedded-language> ;
173 : read-open-tag ( n string opening -- n' string tag )
174 [ take-tag-name ] dip-1up
175 [ read-props ] 2dip-2up
182 : read-close-tag ( n string opening -- n' string tag )
184 [ take-tag-name ] dip span-slices
188 : unclosed-open-tag? ( obj -- ? )
189 { [ open-tag? ] [ close-tag>> not ] } 1&& ; inline
191 ERROR: unmatched-open-tags-error stack seq ;
192 : check-tag-stack ( stack -- stack )
193 dup [ unclosed-open-tag? ] filter
194 [ unmatched-open-tags-error ] unless-empty ;
196 ERROR: unmatched-closing-tag-error stack tag ;
197 :: find-last-open-tag ( stack name -- seq )
198 stack [ { [ unclosed-open-tag? ] [ name>> name = ] } 1&& ] find-last drop [
201 stack name unmatched-closing-tag-error
204 : lex-html ( stack n string -- stack n' string )
205 "<" slice-til-either {
207 1 split-slice-back [ >string f like [ reach push ] when* ] dip
208 [ 2dup peek1-from ] dip
211 read-close-tag reach over name>> find-last-open-tag unclip
212 swap check-tag-stack >>children
215 { CHAR: ! [ read-doctype-or-cdata ] }
216 { CHAR: ? [ read-processing-instruction ] }
217 { CHAR: % [ read-embedded-language ] }
218 [ drop read-open-tag ]
222 [ drop >string <text> ]
223 } case [ reach push lex-html ] when* ;
225 : string>html ( string -- sequence )
226 [ V{ } clone 0 ] dip lex-html 2drop check-tag-stack ;
228 GENERIC: write-html ( tag -- )
230 : >value ( obj -- string )
232 { [ dup squote? ] [ payload>> "'" 1surround ] }
233 { [ dup dquote? ] [ payload>> "\"" 1surround ] }
237 M: doctype write-html
239 [ values>> [ >value ] map " " join [ " " % % ] unless-empty ]
247 : write-props ( seq -- )
249 dup array? [ first2 >value "=" glue ] [ >value ] if
250 ] map " " join [ " " % % ] unless-empty ;
252 M: processing-instruction write-html
256 [ props>> write-props ]
260 M: open-tag write-html
264 [ props>> write-props ]
266 [ children>> [ write-html ] each ]
267 [ close-tag>> name>> "</" ">" surround % ]
270 M: self-close-tag write-html
274 [ props>> write-props ]
278 M: comment write-html
283 M: string write-html % ;
285 : html>string ( sequence -- string )
286 [ [ write-html ] each ] "" make ;
288 GENERIC#: walk-html 1 ( seq/tag quot -- )
290 M: sequence walk-html [ walk-html ] curry each ;
291 M: string walk-html call( obj -- ) ;
292 M: doctype walk-html call( obj -- ) ;
293 M: processing-instruction walk-html call( obj -- ) ;
294 M: embedded-language walk-html call( obj -- ) ;
295 M: open-tag walk-html [ call( obj -- ) ] 2keep [ children>> ] dip [ walk-html ] curry each ;
296 M: self-close-tag walk-html [ call( obj -- ) ] 2keep [ children>> ] dip [ walk-html ] curry each ;
297 M: comment walk-html call( obj -- ) ;
299 : find-links ( seq -- links )
303 props>> [ drop >lower "href" = ] assoc-find
304 [ nip , ] [ 2drop ] if
307 ] { } make [ payload>> ] map ;