]> gitweb.factorcode.org Git - factor.git/blob - extra/html5/html5.factor
scryfall: better moxfield words
[factor.git] / extra / html5 / html5.factor
1 ! Copyright (C) 2020 Doug Coleman.
2 ! See https://factorcode.org/license.txt for BSD license.
3
4 USING: accessors arrays assocs combinators
5 combinators.short-circuit io io.encodings.utf8 io.files json
6 kernel math math.order memoize modern.slices prettyprint
7 sequences sequences.extras strings suffix-arrays words ;
8
9 IN: html5
10
11 : 1sbuf ( ch -- sbuf ) [ SBUF" " clone ] dip over push ; inline
12 : ?1sbuf ( ch -- sbuf ) [ SBUF" " clone ] dip [ over push ] when* ; inline
13
14 ! https://html.spec.whatwg.org/multipage/parsing.html#tokenization
15
16 ! https://infra.spec.whatwg.org/#namespaces
17 CONSTANT: html-namespace "http://www.w3.org/1999/xhtml"
18 CONSTANT: mathml-namespace "http://www.w3.org/1998/Math/MathML"
19 CONSTANT: svg-namespace "http://www.w3.org/2000/svg"
20 CONSTANT: xlink-namespace "http://www.w3.org/1999/xlink"
21 CONSTANT: xml-namespace "http://www.w3.org/XML/1998/namespace"
22 CONSTANT: xmlns-namespace "http://www.w3.org/2000/xmlns/"
23
24 DEFER: data-state
25 DEFER: (data-state)
26 DEFER: rcdata-state
27 DEFER: (rcdata-state)
28 DEFER: rawtext-state
29 DEFER: (rawtext-state)
30 DEFER: script-data-state
31 DEFER: (script-data-state)
32 DEFER: plaintext-state
33 DEFER: (plaintext-state)
34 DEFER: tag-open-state
35 DEFER: (tag-open-state)
36 DEFER: end-tag-open-state
37 DEFER: (end-tag-open-state)
38 DEFER: tag-name-state
39 DEFER: (tag-name-state)
40 DEFER: rcdata-less-than-sign-state
41 DEFER: (rcdata-less-than-sign-state)
42 DEFER: rcdata-end-tag-open-state
43 DEFER: (rcdata-end-tag-open-state)
44 DEFER: rcdata-end-tag-name-state
45 DEFER: (rcdata-end-tag-name-state)
46 DEFER: rawtext-less-than-sign-state
47 DEFER: (rawtext-less-than-sign-state)
48 DEFER: rawtext-end-tag-open-state
49 DEFER: (rawtext-end-tag-open-state)
50 DEFER: rawtext-end-tag-name-state
51 DEFER: (rawtext-end-tag-name-state)
52 DEFER: script-data-less-than-sign-state
53 DEFER: (script-data-less-than-sign-state)
54 DEFER: script-data-end-tag-open-state
55 DEFER: (script-data-end-tag-open-state)
56 DEFER: script-data-end-tag-name-state
57 DEFER: (script-data-end-tag-name-state)
58 DEFER: script-data-escape-start-state
59 DEFER: (script-data-escape-start-state)
60 DEFER: script-data-escape-start-dash-state
61 DEFER: (script-data-escape-start-dash-state)
62 DEFER: script-data-escaped-state
63 DEFER: (script-data-escaped-state)
64 DEFER: script-data-escaped-dash-state
65 DEFER: (script-data-escaped-dash-state)
66 DEFER: script-data-escaped-dash-dash-state
67 DEFER: (script-data-escaped-dash-dash-state)
68 DEFER: script-data-escaped-less-than-sign-state
69 DEFER: (script-data-escaped-less-than-sign-state)
70 DEFER: script-data-escaped-end-tag-open-state
71 DEFER: (script-data-escaped-end-tag-open-state)
72 DEFER: script-data-escaped-end-tag-name-state
73 DEFER: (script-data-escaped-end-tag-name-state)
74 DEFER: script-data-double-escape-start-state
75 DEFER: (script-data-double-escape-start-state)
76 DEFER: script-data-double-escaped-state
77 DEFER: (script-data-double-escaped-state)
78 DEFER: script-data-double-escaped-dash-state
79 DEFER: (script-data-double-escaped-dash-state)
80 DEFER: script-data-double-escaped-dash-dash-state
81 DEFER: (script-data-double-escaped-dash-dash-state)
82 DEFER: script-data-double-escaped-less-than-sign-state
83 DEFER: (script-data-double-escaped-less-than-sign-state)
84 DEFER: script-data-double-escape-end-state
85 DEFER: (script-data-double-escape-end-state)
86 DEFER: before-attribute-name-state
87 DEFER: (before-attribute-name-state)
88 DEFER: attribute-name-state
89 DEFER: (attribute-name-state)
90 DEFER: after-attribute-name-state
91 DEFER: (after-attribute-name-state)
92 DEFER: before-attribute-value-state
93 DEFER: (before-attribute-value-state)
94 DEFER: attribute-value-double-quoted-state
95 DEFER: (attribute-value-double-quoted-state)
96 DEFER: attribute-value-single-quoted-state
97 DEFER: (attribute-value-single-quoted-state)
98 DEFER: attribute-value-unquoted-state
99 DEFER: (attribute-value-unquoted-state)
100 DEFER: after-attribute-value-quoted-state
101 DEFER: (after-attribute-value-quoted-state)
102 DEFER: self-closing-start-tag-state
103 DEFER: (self-closing-start-tag-state)
104 DEFER: bogus-comment-state
105 DEFER: (bogus-comment-state)
106 DEFER: markup-declaration-open-state
107 DEFER: (markup-declaration-open-state)
108 DEFER: comment-start-state
109 DEFER: (comment-start-state)
110 DEFER: comment-start-dash-state
111 DEFER: (comment-start-dash-state)
112 DEFER: comment-state
113 DEFER: (comment-state)
114 DEFER: comment-less-than-sign-state
115 DEFER: (comment-less-than-sign-state)
116 DEFER: comment-less-than-sign-bang-state
117 DEFER: (comment-less-than-sign-bang-state)
118 DEFER: comment-less-than-sign-bang-dash-state
119 DEFER: (comment-less-than-sign-bang-dash-state)
120 DEFER: comment-less-than-sign-bang-dash-dash-state
121 DEFER: (comment-less-than-sign-bang-dash-dash-state)
122 DEFER: comment-end-dash-state
123 DEFER: (comment-end-dash-state)
124 DEFER: comment-end-state
125 DEFER: (comment-end-state)
126 DEFER: comment-end-bang-state
127 DEFER: (comment-end-bang-state)
128 DEFER: doctype-state
129 DEFER: (doctype-state)
130 DEFER: before-doctype-name-state
131 DEFER: (before-doctype-name-state)
132 DEFER: doctype-name-state
133 DEFER: (doctype-name-state)
134 DEFER: after-doctype-name-state
135 DEFER: (after-doctype-name-state)
136 DEFER: after-doctype-public-keyword-state
137 DEFER: (after-doctype-public-keyword-state)
138 DEFER: before-doctype-public-identifier-state
139 DEFER: (before-doctype-public-identifier-state)
140 DEFER: doctype-public-identifier-double-quoted-state
141 DEFER: (doctype-public-identifier-double-quoted-state)
142 DEFER: doctype-public-identifier-single-quoted-state
143 DEFER: (doctype-public-identifier-single-quoted-state)
144 DEFER: after-doctype-public-identifier-state
145 DEFER: (after-doctype-public-identifier-state)
146 DEFER: between-doctype-public-and-system-identifiers-state
147 DEFER: (between-doctype-public-and-system-identifiers-state)
148 DEFER: after-doctype-system-keyword-state
149 DEFER: (after-doctype-system-keyword-state)
150 DEFER: before-doctype-system-identifier-state
151 DEFER: (before-doctype-system-identifier-state)
152 DEFER: doctype-system-identifier-double-quoted-state
153 DEFER: (doctype-system-identifier-double-quoted-state)
154 DEFER: doctype-system-identifier-single-quoted-state
155 DEFER: (doctype-system-identifier-single-quoted-state)
156 DEFER: after-doctype-system-identifier-state
157 DEFER: (after-doctype-system-identifier-state)
158 DEFER: bogus-doctype-state
159 DEFER: (bogus-doctype-state)
160 DEFER: cdata-section-state
161 DEFER: (cdata-section-state)
162 DEFER: cdata-section-bracket-state
163 DEFER: (cdata-section-bracket-state)
164 DEFER: cdata-section-end-state
165 DEFER: (cdata-section-end-state)
166 DEFER: character-reference-state
167 DEFER: (character-reference-state)
168 DEFER: named-character-reference-state
169 DEFER: (named-character-reference-state)
170 DEFER: ambiguous-ampersand-state
171 DEFER: (ambiguous-ampersand-state)
172 DEFER: numeric-character-reference-state
173 DEFER: (numeric-character-reference-state)
174 DEFER: hexadecimal-character-reference-start-state
175 DEFER: (hexadecimal-character-reference-start-state)
176 DEFER: decimal-character-reference-start-state
177 DEFER: (decimal-character-reference-start-state)
178 DEFER: hexadecimal-character-reference-state
179 DEFER: (hexadecimal-character-reference-state)
180 DEFER: decimal-character-reference-state
181 DEFER: (decimal-character-reference-state)
182 DEFER: numeric-character-reference-end-state
183 DEFER: (numeric-character-reference-end-state)
184
185
186 ERROR: unimplemented string ;
187 ERROR: unimplemented* ;
188
189 ! Errors: https://html.spec.whatwg.org/multipage/parsing.html#parse-errors
190 ERROR: abrupt-closing-of-empty-comment ;
191 ERROR: abrupt-doctype-public-identifier ;
192 ERROR: abrupt-doctype-system-identifier ;
193 ERROR: absence-of-digits-in-numeric-character-reference ;
194 ERROR: cdata-in-html-content ;
195 ERROR: character-reference-outside-unicode-range ;
196 ERROR: control-character-in-input-stream ;
197 ERROR: control-character-reference ;
198 ERROR: end-tag-with-attributes ;
199 ERROR: duplicate-attribute ;
200 ERROR: end-tag-with-trailing-solidus ;
201 ERROR: eof-before-tag-name ;
202 ERROR: eof-in-cdata ;
203 ERROR: eof-in-comment ;
204 ERROR: eof-in-doctype ;
205 ERROR: eof-in-script-html-comment-like-text ;
206 ERROR: eof-in-tag ;
207 ERROR: incorrectly-closed-comment ;
208 ERROR: incorrectly-opened-comment ;
209 ERROR: invalid-character-sequence-after-doctype-name ;
210 ERROR: invalid-first-character-of-tag-name ;
211 ERROR: missing-attribute-value ;
212 ERROR: missing-doctype-name ;
213 ERROR: missing-doctype-public-identifier ;
214 ERROR: missing-doctype-system-identifier ;
215 ERROR: missing-end-tag-name ;
216 ERROR: missing-quote-before-doctype-public-identifier ;
217
218 ERROR: missing-quote-before-doctype-system-identifier ;
219 ERROR: missing-semicolon-after-character-reference ;
220 ERROR: missing-whitespace-after-doctype-public-keyword ;
221 ERROR: missing-whitespace-after-doctype-system-keyword ;
222 ERROR: missing-whitespace-before-doctype-name ;
223 ERROR: missing-whitespace-between-attributes ;
224 ERROR: missing-whitespace-between-doctype-public-and-system-identifiers ;
225 ERROR: nested-comment ;
226 ERROR: noncharacter-character-reference ;
227 ERROR: noncharacter-in-input-stream ;
228 ERROR: non-void-html-element-start-tag-with-trailing-solidus ;
229 ERROR: null-character-reference ;
230 ERROR: surrogate-character-reference ;
231 ERROR: surrogate-in-input-stream ;
232 ERROR: unexpected-character-after-doctype-system-identifier ;
233 ERROR: unexpected-character-in-attribute-name ;
234 ERROR: unexpected-character-in-unquoted-attribute-value ;
235 ERROR: unexpected-equals-sign-before-attribute-name ;
236 ERROR: unexpected-null-character ;
237 ERROR: unexpected-question-mark-instead-of-tag-name ;
238 ERROR: unexpected-solidus-in-tag ;
239 ERROR: unknown-named-character-reference ;
240
241 ! Tree insertion modes
242 SINGLETONS: initial-mode before-html-mode before-head-mode
243 in-head-mode in-head-noscript-mode after-head-mode
244 in-body-mode text-mode in-table-mode in-table-text-mode
245 in-caption-mode in-column-group-mode in-table-body-mode
246 in-row-mode in-cell-mode in-select-mode in-select-in-table-mode in-template-mode
247 after-body-mode in-frameset-mode after-frameset-mode after-after-body-mode
248 after-after-frameset-mode ;
249
250 TUPLE: document
251 quirks-mode?
252 limited-quirks-mode?
253 iframe-srcdoc?
254 scripting? ! set in constructor
255 frameset-ok? ! frameset-ok? but we want default to f
256 fostering-parent?
257 tree
258 tree-doctype
259 head-element-pointer ! set during insertion time
260 parser-cannot-change-mode-flag
261 insertion-mode
262 original-insertion-mode
263 last
264 node
265 context
266 doctype
267 tag
268 end-tag
269
270 tag-name
271 end-tag-name
272 attribute-name
273 attribute-value
274 temporary-buffer
275 comment-token
276 open-elements
277 return-state ;
278
279 ! "reset the insertion mode appropriately"
280 ! : reset-insertion-mode ( document -- document )
281 !     f >>last
282 !     dup open-elements>> ?last >>node
283 !     dup [ open-elements>> ?first ] [ node>> ] bi = [
284 !         t >>last dup node>> >>context
285 !     ] when
286 !     dup node>> {
287 !         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
288 !         { [
289 !             dup name>> >lower { "td" "th" } member?
290 !             pick last>> f = and
291 !         ] [ drop in-select >>insertion-mode ] }
292 !         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
293 !         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
294 !         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
295 !         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
296 !         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
297 !         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
298 !         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
299 !         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
300 !         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
301 !     } cond
302 !     ;
303
304 : temporary-buffer-attribute? ( document -- ? )
305     return-state>>
306     {
307         attribute-value-unquoted-state
308         attribute-value-single-quoted-state
309         attribute-value-double-quoted-state
310     } member? ;
311
312 ! name, public/system identifier should not be empty strings
313 ! until the state machine demands it
314 TUPLE: doctype
315     name
316     public-identifier
317     system-identifier
318     quirks? ;
319
320 : <doctype> ( -- doctype )
321     doctype new ; inline
322
323 : new-doctype-from-ch ( ch document -- )
324     [
325         doctype new
326             swap ?1sbuf >>name
327     ] dip doctype<< ; inline
328
329 : new-doctype-with-quirks ( document -- )
330     <doctype> t >>quirks? >>doctype drop ;
331
332 TUPLE: tag self-closing? name attributes children end-tag ;
333
334 : <tag> ( -- tag )
335     tag new
336         SBUF" " clone >>name
337         V{ } clone >>attributes
338         V{ } clone >>children ;
339
340 TUPLE: end-tag self-closing? name attributes ;
341
342 : <end-tag> ( -- tag )
343     end-tag new
344         SBUF" " clone >>name
345         V{ } clone >>attributes ;
346
347 : new-tag ( document -- )
348     <tag> >>tag drop ;
349
350 : new-end-tag ( document -- )
351     <end-tag> >>tag drop ;
352
353 : set-self-closing ( document -- )
354     tag>> t >>self-closing? drop ;
355
356 : <document> ( -- document )
357     document new
358         V{ } clone >>tree
359         initial-mode >>insertion-mode
360         <doctype> >>doctype
361         t >>frameset-ok?
362         ! SBUF" " clone >>tag-name
363         SBUF" " clone >>attribute-name
364         SBUF" " clone >>attribute-value
365         SBUF" " clone >>temporary-buffer
366         SBUF" " clone >>comment-token
367         V{ } clone >>open-elements
368     ; inline
369
370 TUPLE: comment open payload close ;
371
372 : <comment> ( payload -- comment )
373     comment new
374         swap >>payload ; inline
375
376 : force-quirks ( document -- )
377     doctype>> t >>quirks? drop ;
378
379 : initialize-doctype-name ( document -- )
380     [ SBUF" " clone ] dip doctype>> name<< ;
381
382 : initialize-doctype-public-identifier ( document -- )
383     [ SBUF" " clone ] dip doctype>> public-identifier<< ;
384
385 : initialize-doctype-system-identifier ( document -- )
386     [ SBUF" " clone ] dip doctype>> system-identifier<< ;
387
388 : push-doctype-name ( ch document -- )
389     doctype>> name>> push ;
390
391 : push-doctype-public-identifier ( ch document -- )
392     doctype>> public-identifier>> push ;
393
394 : push-doctype-system-identifier ( ch document -- )
395     doctype>> system-identifier>> push ;
396
397 ! XXX: not html5 spec, fix
398 ERROR: unmatched-closing-tag-error stack tag ;
399
400 : unclosed-tag? ( obj -- ? )
401     { [ tag? ] [ end-tag>> not ] } 1&& ; inline
402
403 :: find-matching-tag ( name stack -- seq )
404     stack [ { [ unclosed-tag? ] [ name>> name = ] } 1&& ] find-last drop [
405         stack swap shorten*
406     ] [
407         stack name unmatched-closing-tag-error
408     ] if* ;
409
410 DEFER: tree-insert
411 GENERIC: tree-insert* ( document obj insertion-mode -- document )
412
413 : limited-quirks-mode? ( doctype -- ? )
414     {
415         [ public-identifier>> "-//W3C//DTD XHTML 1.0 Frameset//" head? ]
416         [ public-identifier>> "-//W3C//DTD XHTML 1.0 Transitional//" head? ]
417         [ { [ system-identifier>> ] [ public-identifier>> "-//W3C//DTD HTML 4.01 Frameset//" head? ] } 1&& ]
418         [ { [ system-identifier>> ] [ public-identifier>> "-//W3C//DTD HTML 4.01 Transitional//" head? ] } 1&& ]
419     } 1|| ;
420
421 ! https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode
422 M: initial-mode tree-insert*
423     drop {
424         { [ dup "\t\n\f\r\s" member? ] [ drop ] }
425         { [ dup doctype? ] [
426             >>tree-doctype before-html-mode >>insertion-mode
427         ] }
428         { [ dup comment? ] [ over tree>> push ] }
429         { [ dup tag? ] [ over tree>> push ] }
430         { [ dup end-tag? ] [
431             dup name>> pick tree>> find-matching-tag
432             unclip
433                 swap >>children
434                 swap >>end-tag
435             over tree>> push
436         ] }
437         [
438             over iframe-srcdoc?>> [
439                 over parser-cannot-change-mode-flag>> [
440                     [ t >>quirks-mode? ] dip
441                 ] unless
442             ] [
443                 "must be iframe-srcdoc here" throw
444             ] if
445             ! reprocess the token
446             before-html-mode >>insertion-mode tree-insert
447         ]
448     } cond ;
449
450 ! https://html.spec.whatwg.org/multipage/parsing.html#the-before-html-insertion-mode
451 M: before-html-mode tree-insert*
452     drop {
453         { [ dup doctype? ] [ drop ] }
454         { [ dup comment? ] [ over tree>> push ] }
455         { [ dup "\t\n\f\r\s" member? ] [ drop ] }
456         { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [
457             over tree>> push
458             before-head-mode >>insertion-mode
459         ] }
460         ! these tags are handled in the default case
461         { [ dup { [ end-tag? ] [ name>> { "head" "body" "html" "br" } member? not ] } 1&& ] [
462             ! error end-tag, ignore
463             drop
464         ] }
465         [
466             ! Create missing html tag and reprocess the token
467             <tag> "html" >>name pick tree>> push
468             before-head-mode >>insertion-mode tree-insert
469         ]
470     } cond ;
471
472 M: before-head-mode tree-insert*
473     drop {
474         { [ dup "\t\n\f\r\s" member? ] [ drop ] }
475         { [ dup comment? ] [ over tree>> push ] }
476         { [ dup doctype? ] [ drop ] }
477         { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [
478             ! XXX: in-body-mode rules here for html tag
479             ! B
480             ! over tree>> push
481             ! before-head-mode >>insertion-mode
482             "handle html in-body-mode here" throw
483         ] }
484         { [ dup { [ tag? ] [ name>> "head" = ] } 1&& ] [
485             [ swap tree>> push ]
486             [ >>head-element-pointer drop ]
487             [ drop in-head-mode >>insertion-mode ] 2tri
488         ] }
489         ! these tags are handled in the default case
490         { [ dup { [ end-tag? ] [ name>> { "head" "body" "html" "br" } member? not ] } 1&& ] [
491             ! error end-tag, ignore
492             drop
493         ] }
494         ! ignore tag
495         { [ dup tag? ] [ drop ] }
496         [
497             ! Create missing html tag and reprocess the token
498             <tag>
499             [ "head" >>name pick tree>> push ]
500             [ >>head-element-pointer ] bi
501             in-head-mode >>insertion-mode tree-insert
502         ]
503     } cond ;
504
505 M: in-head-mode tree-insert*
506     drop {
507         { [ dup "\t\n\f\r\s" member? ] [ over tree>> push ] }
508         { [ dup comment? ] [ over tree>> push ] }
509         { [ dup doctype? ] [ drop ] }
510         { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [
511             ! XXX: in-body-mode rules here for html tag
512             ! B
513             ! over tree>> push
514             ! before-head-mode >>insertion-mode
515             "handle html in-body-mode here" throw
516             unimplemented*
517         ] }
518         { [ dup { [ tag? ] [ name>> { "base" "basefont" "bgsound" "link" } member? ] } 1&& ] [
519             ! non-void-html-element-start-tag-with-trailing-solidus soft error if not self-closing
520             unimplemented*
521         ] }
522         { [ dup { [ tag? ] [ name>> "meta" = ] } 1&& ] [
523             unimplemented*
524         ] }
525         { [ dup { [ tag? ] [ name>> "title" = ] } 1&& ] [
526             ! https://html.spec.whatwg.org/multipage/parsing.html#generic-rcdata-element-parsing-algorithm
527             "insert title node" throw
528             unimplemented*
529         ] }
530         { [
531             dup {
532                 [ { [ tag? ] [ name>> "noscript" = ] [ scripting?>> ] } 1&& ]
533                 [ { [ tag? ] [ name>> { "noframes" "style" } member? ] } 1&& ]
534             } 1||
535         ] [
536             ! https://html.spec.whatwg.org/multipage/parsing.html#generic-raw-text-element-parsing-algorithm
537             unimplemented*
538         ] }
539         { [ dup { [ tag? ] [ name>> "noscript" = ] [ scripting?>> not ] } 1&& ] [
540             unimplemented*
541             over tree>> push
542             in-head-noscript-mode >>insertion-mode
543         ] }
544         { [ dup { [ tag? ] [ name>> "script" = ] } 1&& ] [
545             unimplemented*
546             text-mode >>insertion-mode
547         ] }
548         { [ dup { [ end-tag? ] [ name>> "head" = ] } 1&& ] [
549             over tree>> last end-tag<<
550             after-head-mode >>insertion-mode
551         ] }
552         { [ dup { [ end-tag? ] [ name>> { "body" "html" "br" } member? ] } 1&& ] [
553             ! non-void-html-element-start-tag-with-trailing-solidus soft error if not self-closing
554             unimplemented*
555         ] }
556         { [ dup { [ tag? ] [ name>> "template" = ] } 1&& ] [
557             unimplemented*
558             in-template-mode >>insertion-mode
559         ] }
560         { [ dup { [ end-tag? ] [ name>> "template" = ] } 1&& ] [
561             unimplemented*
562         ] }
563         ! XXX: revisit this
564         { [ dup {
565             [ { [ tag? ] [ name>> "head" = ] } 1&& ]
566             [ end-tag? ]
567             } 1|| ] [ drop "ignore here" throw ] }
568         [
569             ! end head tag should be here, pop off, reprocess
570             over tree>> pop swap >>end-tag
571             after-head-mode >>insertion-mode "omg" throw
572         ]
573     } cond ;
574
575 M: in-head-noscript-mode tree-insert* drop unimplemented* ;
576
577 M: after-head-mode tree-insert*
578     drop {
579         { [ dup "\t\n\f\r\s" member? ] [ over tree>> push ] }
580         { [ dup comment? ] [ over tree>> push ] }
581         { [ dup doctype? ] [ drop ] }
582         { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [
583             ! XXX: in-body-mode rules here for html tag
584             ! B
585             ! over tree>> push
586             ! before-head-mode >>insertion-mode
587             "handle html in-body-mode here" throw
588             unimplemented*
589         ] }
590         { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [
591             unimplemented*
592         ] }
593         { [ dup { [ tag? ] [ name>> "body" = ] } 1&& ] [
594             over tree>> push
595             f >>frameset-ok?
596             in-body-mode >>insertion-mode
597         ] }
598         { [ dup { [ tag? ] [ name>> "frameset" = ] } 1&& ] [
599             unimplemented*
600         ] }
601         { [ dup { [ tag? ] [
602             name>> {
603                 "base" "basefont" "bgsound" "link" "meta"
604                 "noframes" "script" "style" "template" "title"
605             } member? ] } 1&&
606         ] [
607             unimplemented*
608         ] }
609         { [ dup { [ end-tag? ] [ name>> "template" = ] } 1&& ] [
610             unimplemented*
611         ] }
612         ! same as default case
613         ! { [ dup { [ end-tag? ] [ name>> { "body" "html" "br" } member? not ] } 1&& ] [
614         !     unimplemented*
615         ! ] }
616         { [
617             dup {
618                 [ { [ tag? ] [ name>> "head" = ] } 1&& ]
619                 [ { [ end-tag? ] [ name>> { "body" "html" "br" } member? not ] } 1&& ]
620             } 1||
621         ] [
622             "omg revisit this" throw
623             unimplemented*
624         ] }
625         [
626             B
627             <tag> "body" >>name pick tree>> push
628             in-body-mode >>insertion-mode tree-insert
629         ]
630     } cond ;
631
632 M: in-body-mode tree-insert*
633     drop {
634         { [ dup CHAR: \0 = ] [ drop ] }
635         { [ dup "\t\n\f\r\s" member? ] [ over tree>> push ] }
636         { [ dup comment? ] [ over tree>> push ] }
637         { [ dup doctype? ] [ drop ] }
638         { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [ drop ] }
639         { [ dup {
640             [
641                 {
642                     [ tag? ]
643                     [
644                         name>> {
645                             "base" "basefont" "bgsound" "link" "meta"
646                             "noframes" "script" "style" "template" "title"
647                         } member?
648                     ]
649                 } 1&&
650             ] [
651                 { [ end-tag? ] [ name>> "template" = ] } 1&&
652             ] } 1||
653         ] [
654             unimplemented*
655         ] }
656         ! XXX: parse error
657         { [ dup { [ tag? ] [ name>> "body" = ] } 1&& ] [ drop unimplemented* ] }
658         { [ dup { [ tag? ] [ name>> "frameset" = ] } 1&& ] [ drop unimplemented* ] }
659         ! XXX: eof
660         ! { [ ] [ ] }
661         { [ dup { [ end-tag? ] [ name>> "body" = ] } 1&& ] [
662             "body" pick tree>> find-matching-tag
663             unclip
664                 swap >>children
665                 swap >>end-tag
666             over tree>> push
667
668             after-body-mode >>insertion-mode
669         ] }
670         { [ dup { [ end-tag? ] [ name>> "html" = ] } 1&& ] [ drop unimplemented* ] }
671         ! { [ ] [ ] }
672         [
673             unimplemented*
674         ]
675     } cond ;
676
677 M: text-mode tree-insert* drop unimplemented* ;
678 M: in-table-mode tree-insert* drop unimplemented* ;
679 M: in-table-text-mode tree-insert* drop unimplemented* ;
680 M: in-caption-mode tree-insert* drop unimplemented* ;
681 M: in-column-group-mode tree-insert* drop unimplemented* ;
682 M: in-table-body-mode tree-insert* drop unimplemented* ;
683 M: in-row-mode tree-insert* drop unimplemented* ;
684 M: in-cell-mode tree-insert* drop unimplemented* ;
685 M: in-select-mode tree-insert* drop unimplemented* ;
686 M: in-select-in-table-mode tree-insert* drop unimplemented* ;
687 M: in-template-mode tree-insert* drop unimplemented* ;
688 M: after-body-mode tree-insert*
689     drop {
690         { [ dup "\t\n\f\r\s" member? ] [ over tree>> push ] }
691         { [ dup comment? ] [ over tree>> push ] }
692         { [ dup doctype? ] [ drop ] }
693         { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [
694             unimplemented*
695         ] }
696         { [ dup { [ end-tag? ] [ name>> "html" = ] } 1&& ] [
697             ! XXX: make this a function
698             "html" pick tree>> find-matching-tag
699             unclip
700                 swap >>children
701                 swap >>end-tag
702             over tree>> push
703
704             after-after-body-mode >>insertion-mode
705         ] }
706         [
707             unimplemented*
708         ]
709     } cond ;
710 M: in-frameset-mode tree-insert* drop unimplemented* ;
711 M: after-frameset-mode tree-insert* drop unimplemented* ;
712
713 M: after-after-body-mode tree-insert*
714     drop {
715         { [ dup comment? ] [ over tree>> push ] }
716         { [ dup doctype? ] [ unimplemented*  ] }
717         { [ dup "\t\n\f\r\s" member? ] [ unimplemented*  ] }
718         { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [ unimplemented* ] }
719         ! eof
720         { [ dup f = ] [ drop ] }
721         [
722             ! XXX: parse error
723             [ in-body-mode >>insertion-mode ] dip tree-insert
724         ]
725     } cond ;
726
727 M: after-after-frameset-mode tree-insert* drop unimplemented* ;
728
729 : tree-insert ( document obj -- document )
730     over insertion-mode>> tree-insert* ;
731
732 MEMO: load-entities ( -- assoc )
733     "vocab:html5/entities.json" utf8 file-contents json> ;
734
735 MEMO: entities-suffix-array ( -- assoc )
736     load-entities keys >suffix-array ;
737
738 : lookup-entity ( string -- entity/string ? )
739     load-entities ?at ;
740
741 : named-character-match? ( document -- prefix? exact? )
742     temporary-buffer>>
743     [ entities-suffix-array query f like ]
744     [ last CHAR: ; = ] bi ;
745
746 ERROR: unknown-named-entity entity ;
747 : take-named-character ( document -- )
748     dup
749     temporary-buffer>> >string lookup-entity [
750         "characters" of
751         SBUF" " clone-like >>temporary-buffer drop
752     ] [
753         unknown-named-entity
754     ] if ;
755
756 ! XXX: remove the tag>> name>> push part
757 : push-tag-name ( ch document -- )
758     [ tag>> name>> push ]
759     [
760         2drop ! tag-name>> push
761     ] 2bi ;
762 : push-attribute-name ( ch document -- ) attribute-name>> push ;
763 : push-attribute-value ( ch document -- ) attribute-value>> push ;
764 : push-comment-token ( ch document -- ) comment-token>> push ;
765 : push-all-comment-token ( string document -- ) comment-token>> push-all ;
766
767 ERROR: invalid-return-state obj ;
768 : check-return-state ( obj -- return-state )
769     dup word? [ invalid-return-state ] unless ;
770
771 : current-attribute ( document -- attribute/f )
772     [ attribute-name>> >string f like ]
773     [ attribute-value>> >string f like ] bi
774     2dup or [ 2array ] [ 2drop f ] if ;
775
776 : push-when ( obj/f seq -- )
777     over [ push ] [ 2drop ] if ; inline
778
779 : reset-attribute ( document -- )
780     SBUF" " clone >>attribute-name
781     SBUF" " clone >>attribute-value drop ;
782
783 : push-attribute ( document -- )
784     [ current-attribute ]
785     [ tag>> attributes>> push-when ]
786     [ reset-attribute ] tri ;
787
788 : emit-eof ( document -- )
789     "emit-eof" print
790     f tree-insert drop ;
791 : emit-char ( char document -- ) drop "emit-char: " write 1string . ;
792 : emit-string ( char document -- ) drop "emit-string: " write . ;
793 : emit-tag ( document -- )
794     "emit-tag: " write
795     {
796         [ tag>> [ name>> >string ] [ name<< ] bi ]
797         [ push-attribute ]
798         [ tag>> . ]
799         [ dup tag>> tree-insert drop ]
800         [ f >>tag drop ]
801     } cleave ;
802 : emit-end-tag ( document -- )
803     "emit-end-tag: " write
804     [ tag>> . ]
805     [ f >>tag drop ] bi ;
806 : emit-comment-token ( document -- )
807     "emit-comment-token: " write
808     {
809         [ comment-token>> >string . ]
810         [ dup comment-token>> >string <comment> tree-insert drop ]
811         [ SBUF" " clone >>comment-token drop ]
812     } cleave ;
813 : emit-doctype ( document -- )
814     "emit-doctype: " write dup doctype>> .
815     {
816         [ doctype>> [ >string ] change-name drop ]
817         [
818             ! XXX: handle iframe srcdoc document
819             dup { [ doctype>> name>> "html" = not ] [ parser-cannot-change-mode-flag>> not ] } 1&& [
820                 t >>quirks-mode?
821             ] [
822                 dup { [ iframe-srcdoc?>> not ] [ parser-cannot-change-mode-flag>> not ] } 1&& [
823                     dup doctype>> limited-quirks-mode? [ t >>limited-quirks-mode? ] when
824                 ] when
825             ] if
826             drop
827         ]
828         [ dup doctype>> tree-insert drop ]
829         [ f >>doctype drop ]
830     } cleave ;
831
832 : reset-temporary-buffer ( document -- ) SBUF" " clone temporary-buffer<< ;
833 : ch>new-temporary-buffer ( ch document -- ) [ 1sbuf ] dip temporary-buffer<< ;
834 : string>new-temporary-buffer ( string document -- ) [ SBUF" " clone-like ] dip temporary-buffer<< ;
835 : temporary-buffer-last ( document -- ch/f ) temporary-buffer>> ?last ;
836 : push-temporary-buffer ( ch document -- ) temporary-buffer>> push ;
837 : push-all-temporary-buffer ( string document -- ) temporary-buffer>> push-all ;
838
839 : flush-temporary-buffer ( document -- )
840     "flush-temporary-buffer: " write
841     [ [ temporary-buffer>> ] keep [ emit-char ] curry each ]
842     [ SBUF" " clone >>temporary-buffer drop ] bi ;
843
844 : emit-temporary-buffer-with ( string document -- )
845     [ temporary-buffer>> push-all ]
846     [ flush-temporary-buffer ] bi ;
847
848 ! check if matches open tag
849 : appropriate-end-tag-token? ( document -- ? )
850     drop f ;
851
852 : ascii-upper-alpha? ( ch -- ? ) [ CHAR: A CHAR: Z between? ] [ f ] if* ; inline
853 : ascii-lower-alpha? ( ch -- ? ) [ CHAR: a CHAR: z between? ] [ f ] if* ; inline
854 : ascii-upper-hex-digit? ( ch -- ? ) [ CHAR: A CHAR: F between? ] [ f ] if* ; inline
855 : ascii-lower-hex-digit? ( ch -- ? ) [ CHAR: a CHAR: f between? ] [ f ] if* ; inline
856 : ascii-hex-alpha? ( ch -- ? ) { [ ascii-upper-hex-digit? ] [ ascii-lower-hex-digit? ] } 1|| ; inline
857
858 : ascii-digit? ( ch/f -- ? ) [ CHAR: 0 CHAR: 9 between? ] [ f ] if* ;
859 : ascii-alpha? ( ch/f -- ? ) { [ ascii-lower-alpha? ] [ ascii-upper-alpha? ] } 1|| ;
860 : ascii-alphanumeric? ( ch/f -- ? ) { [ ascii-alpha? ] [ ascii-digit? ] } 1|| ;
861 : ascii-hex-digit? ( ch/f -- ? ) { [ ascii-digit? ] [ ascii-hex-alpha? ] } 1|| ;
862
863 : (return-state) ( document n/f string ch/f -- document n'/f string )
864     reach [ f ] change-return-state drop check-return-state
865     execute( document n/f string ch/f -- document n'/f string ) ;
866
867 : return-state ( document n/f string -- document n'/f string )
868     pick [ f ] change-return-state drop check-return-state
869     execute( document n/f string -- document n'/f string ) ;
870
871 : (data-state) ( document n/f string ch/f -- document n'/f string )
872     {
873         { [ dup CHAR: & = ] [ drop [ \ data-state >>return-state ] 2dip character-reference-state ] }
874         { [ dup CHAR: < = ] [ drop tag-open-state ] }
875         { [ dup CHAR: \0 = ] [ unexpected-null-character ] }
876         { [ dup f = ] [ drop pick emit-eof ] }
877         [ reach emit-char data-state ]
878     } cond ;
879
880 : data-state ( document n/f string -- document n'/f string )
881     take-char (data-state) ;
882
883
884 : (rcdata-state) ( document n/f string ch/f -- document n'/f string )
885     {
886         { [ dup CHAR: & = ] [ drop [ \ rcdata-state >>return-state ] 2dip character-reference-state ] }
887         { [ dup CHAR: < = ] [ drop rcdata-less-than-sign-state ] }
888         { [ dup CHAR: \0 = ] [ unexpected-null-character ] }
889         { [ dup f = ] [ drop pick emit-eof ] }
890         [ reach emit-char rcdata-state ]
891     } cond ;
892
893 : rcdata-state ( document n/f string -- document n'/f string )
894     take-char (rcdata-state) ;
895
896
897 : (rawtext-state) ( document n/f string ch/f -- document n'/f string )
898     {
899         { [ dup CHAR: < = ] [ drop rawtext-less-than-sign-state ] }
900         { [ dup CHAR: \0 = ] [ drop unexpected-null-character ] }
901         { [ dup f = ] [ drop pick emit-eof ] }
902         [ reach emit-char rawtext-state ]
903     } cond ;
904
905 : rawtext-state ( document n/f string -- document n'/f string )
906     take-char (rawtext-state) ;
907
908
909 : (script-data-state) ( document n/f string ch/f -- document n'/f string )
910     {
911         { [ dup CHAR: < = ] [ drop script-data-less-than-sign-state ] }
912         { [ dup CHAR: \0 = ] [ drop unexpected-null-character ] }
913         { [ dup f = ] [ drop pick emit-eof ] }
914         [ reach emit-char script-data-state ]
915     } cond ;
916
917 : script-data-state ( document n/f string -- document n'/f string )
918     take-char (script-data-state) ;
919
920
921 : (plaintext-state) ( document n/f string ch/f -- document n'/f string )
922     {
923         { [ dup CHAR: \0 = ] [ drop unexpected-null-character ] }
924         { [ dup f = ] [ drop pick emit-eof ] }
925         [ reach emit-char plaintext-state ]
926     } cond ;
927
928 : plaintext-state ( document n/f string -- document n'/f string )
929     take-char (plaintext-state) ;
930
931
932 : (tag-open-state) ( document n/f string ch/f -- document n'/f string )
933     {
934         { [ dup ascii-alpha? ] [ reach new-tag (tag-name-state) ] }
935         { [ dup CHAR: ! = ] [ drop markup-declaration-open-state ] }
936         { [ dup CHAR: / = ] [ drop end-tag-open-state ] }
937         { [ dup CHAR: ? = ] [ unexpected-question-mark-instead-of-tag-name ] }
938         { [ dup f = ] [ eof-before-tag-name ] }
939         [ invalid-first-character-of-tag-name ]
940     } cond ;
941
942 : tag-open-state ( document n/f string -- document n'/f string )
943     take-char (tag-open-state) ;
944
945
946 : (end-tag-open-state) ( document n/f string ch/f -- document n'/f string )
947     {
948         { [ dup ascii-alpha? ] [ reach new-end-tag (tag-name-state) ] }
949         { [ dup CHAR: > = ] [ missing-end-tag-name ] }
950         { [ dup f = ] [ eof-before-tag-name ] }
951         [ invalid-first-character-of-tag-name ]
952     } cond ;
953
954 : end-tag-open-state ( document n/f string -- document n'/f string )
955     take-char (end-tag-open-state) ;
956
957
958 : (tag-name-state) ( document n/f string ch/f -- document n'/f string )
959     {
960         { [ dup ascii-upper-alpha? ] [ 0x20 + reach push-tag-name tag-name-state ] }
961         { [ dup "\t\n\f\s" member? ] [ drop before-attribute-name-state ] }
962         { [ dup CHAR: / = ] [ drop self-closing-start-tag-state ] }
963         { [ dup CHAR: > = ] [ drop pick emit-tag data-state ] }
964         { [ dup CHAR: \0 = ] [ unexpected-null-character ] }
965         { [ dup f = ] [ eof-before-tag-name ] }
966         [ reach push-tag-name tag-name-state ]
967     } cond ;
968
969 : tag-name-state ( document n/f string -- document n'/f string )
970     take-char (tag-name-state) ;
971
972
973 : (rcdata-less-than-sign-state) ( document n/f string ch/f -- document n'/f string )
974     {
975         { [ dup CHAR: / = ] [ drop pick reset-temporary-buffer rcdata-end-tag-open-state ] }
976         [ [ CHAR: < reach emit-char ] dip (rcdata-state) ]
977     } cond ;
978
979 : rcdata-less-than-sign-state ( document n/f string -- document n'/f string )
980     take-char (rcdata-less-than-sign-state) ;
981
982
983 : (rcdata-end-tag-open-state) ( document n/f string ch/f -- document n'/f string )
984     {
985         { [ dup ascii-alpha? ] [ reach new-end-tag (rcdata-end-tag-name-state) ] }
986         [ [ CHAR: < reach emit-char ] dip (rcdata-state) ]
987     } cond ;
988
989 : rcdata-end-tag-open-state ( document n/f string -- document n'/f string )
990     take-char (rcdata-end-tag-open-state) ;
991
992
993 : (rcdata-end-tag-name-state) ( document n/f string ch/f -- document n'/f string )
994     {
995         { [ dup "\t\n\f\s" member? ] [
996             drop pick appropriate-end-tag-token?
997             [ before-attribute-name-state ] [ "</" reach emit-temporary-buffer-with rcdata-state ] if
998         ] }
999         { [ dup CHAR: / = ] [
1000             drop pick appropriate-end-tag-token?
1001             [ self-closing-start-tag-state ] [ "</" reach emit-temporary-buffer-with rcdata-state ] if
1002         ] }
1003         { [ dup CHAR: > = ] [
1004             drop pick appropriate-end-tag-token?
1005             [ pick emit-end-tag data-state ] [ "</" reach emit-temporary-buffer-with rcdata-state ] if
1006         ] }
1007         { [ dup ascii-upper-alpha? ] [ [ 0x20 + reach push-tag-name ] [ reach push-temporary-buffer ] bi rcdata-end-tag-name-state ] }
1008         { [ dup ascii-lower-alpha? ] [ [ reach push-tag-name ] [ reach push-temporary-buffer ] bi rcdata-end-tag-name-state ] }
1009         [ [ "</" reach emit-temporary-buffer-with ] dip (rcdata-state) ]
1010     } cond ;
1011
1012 : rcdata-end-tag-name-state ( document n/f string -- document n'/f string )
1013     take-char (rcdata-end-tag-name-state) ;
1014
1015
1016 : (rawtext-less-than-sign-state) ( document n/f string ch/f -- document n'/f string )
1017     {
1018         { [ dup CHAR: / = ] [ drop pick reset-temporary-buffer rawtext-end-tag-open-state ] }
1019         [ [ CHAR: < reach emit-char ] dip (rawtext-state) ]
1020     } cond ;
1021
1022 : rawtext-less-than-sign-state ( document n/f string -- document n'/f string )
1023     take-char (rawtext-less-than-sign-state) ;
1024
1025
1026 : (rawtext-end-tag-open-state) ( document n/f string ch/f -- document n'/f string )
1027     {
1028         { [ dup ascii-alpha? ] [ reach new-end-tag (rawtext-end-tag-name-state) ] }
1029         [ [ CHAR: < reach emit-char ] dip (rawtext-state) ]
1030     } cond ;
1031
1032 : rawtext-end-tag-open-state ( document n/f string -- document n'/f string )
1033     take-char (rawtext-end-tag-open-state) ;
1034
1035
1036 : (rawtext-end-tag-name-state) ( document n/f string ch/f -- document n'/f string )
1037     {
1038         { [ dup "\t\n\f\s" member? ] [
1039             drop pick appropriate-end-tag-token?
1040             [ before-attribute-name-state ] [ "</" reach emit-temporary-buffer-with rawtext-state ] if
1041         ] }
1042         { [ dup CHAR: / = ] [
1043             drop pick appropriate-end-tag-token?
1044             [ self-closing-start-tag-state ] [ "</" reach emit-temporary-buffer-with rawtext-state ] if
1045         ] }
1046         { [ dup CHAR: > = ] [
1047             drop pick appropriate-end-tag-token?
1048             [ pick emit-end-tag data-state ] [ "</" reach emit-temporary-buffer-with rawtext-state ] if
1049         ] }
1050         { [ dup ascii-upper-alpha? ] [ [ 0x20 + reach push-tag-name ] [ reach push-temporary-buffer ] bi rawtext-end-tag-name-state ] }
1051         { [ dup ascii-lower-alpha? ] [ [ reach push-tag-name ] [ reach push-temporary-buffer ] bi rawtext-end-tag-name-state ] }
1052         [ [ "</" reach emit-temporary-buffer-with ] dip (rawtext-state) ]
1053     } cond ;
1054
1055 : rawtext-end-tag-name-state ( document n/f string -- document n'/f string )
1056     take-char (rawtext-end-tag-name-state) ;
1057
1058
1059 : (script-data-less-than-sign-state) ( document n/f string ch/f -- document n'/f string )
1060     {
1061         { [ dup CHAR: / = ] [ drop pick reset-temporary-buffer script-data-end-tag-open-state ] }
1062         { [ dup CHAR: ! = ] [ drop "<!" reach emit-string script-data-escape-start-state ] }
1063         [ [ CHAR: < reach emit-char ] dip (script-data-state) ]
1064     } cond ;
1065
1066 : script-data-less-than-sign-state ( document n/f string -- document n'/f string )
1067     take-char (script-data-less-than-sign-state) ;
1068
1069
1070 : (script-data-end-tag-open-state) ( document n/f string ch/f -- document n'/f string )
1071     {
1072         { [ dup ascii-alpha? ] [ reach new-end-tag (script-data-end-tag-name-state) ] }
1073         [ [ "</" reach emit-string ] dip (script-data-state) ]
1074     } cond ;
1075
1076 : script-data-end-tag-open-state ( document n/f string -- document n'/f string )
1077     take-char (script-data-end-tag-open-state) ;
1078
1079
1080 : (script-data-end-tag-name-state) ( document n/f string ch/f -- document n'/f string )
1081     {
1082         { [ dup "\t\n\f\s" member? ] [
1083             drop pick appropriate-end-tag-token?
1084             [ before-attribute-name-state ] [ "</" reach emit-temporary-buffer-with script-data-state ] if
1085         ] }
1086         { [ dup CHAR: / = ] [
1087             drop pick appropriate-end-tag-token?
1088             [ self-closing-start-tag-state ] [ "</" reach emit-temporary-buffer-with script-data-state ] if
1089         ] }
1090         { [ dup CHAR: > = ] [
1091             drop pick appropriate-end-tag-token?
1092             [ pick emit-end-tag data-state ] [ "</" reach emit-temporary-buffer-with script-data-state ] if
1093         ] }
1094         { [ dup ascii-upper-alpha? ] [ [ 0x20 + reach push-tag-name ] [ reach push-temporary-buffer ] bi rawtext-end-tag-name-state ] }
1095         { [ dup ascii-lower-alpha? ] [ [ reach push-tag-name ] [ reach push-temporary-buffer ] bi rawtext-end-tag-name-state ] }
1096         [ [ "</" reach emit-temporary-buffer-with ] dip (script-data-state) ]
1097     } cond ;
1098
1099 : script-data-end-tag-name-state ( document n/f string -- document n'/f string )
1100     take-char (script-data-end-tag-name-state) ;
1101
1102
1103 : (script-data-escape-start-state) ( document n/f string ch/f -- document n'/f string )
1104     {
1105         { [ dup CHAR: - = ] [ drop script-data-escape-start-dash-state ] }
1106         [ (script-data-state) ]
1107     } cond ;
1108
1109 : script-data-escape-start-state ( document n/f string -- document n'/f string )
1110     take-char (script-data-escape-start-state) ;
1111
1112
1113 : (script-data-escape-start-dash-state) ( document n/f string ch/f -- document n'/f string )
1114     {
1115         { [ dup CHAR: - = ] [ drop script-data-escaped-dash-dash-state ] }
1116         [ (script-data-state) ]
1117     } cond ;
1118
1119 : script-data-escape-start-dash-state ( document n/f string -- document n'/f string )
1120     take-char (script-data-escape-start-dash-state) ;
1121
1122
1123 : (script-data-escaped-state) ( document n/f string ch/f -- document n'/f string )
1124     {
1125         { [ dup CHAR: - = ] [ drop script-data-escaped-dash-state ] }
1126         { [ dup CHAR: < = ] [ drop script-data-escaped-less-than-sign-state ] }
1127         { [ dup CHAR: \0 = ] [ unexpected-null-character CHAR: replacement-character unimplemented* ] }
1128         { [ dup f = ] [ eof-in-script-html-comment-like-text ] }
1129         [ reach emit-char script-data-escaped-state ]
1130     } cond ;
1131
1132 : script-data-escaped-state ( document n/f string -- document n'/f string )
1133     take-char (script-data-escaped-state) ;
1134
1135
1136 : (script-data-escaped-dash-state) ( document n/f string ch/f -- document n'/f string )
1137     {
1138         { [ dup CHAR: - = ] [ drop script-data-escaped-dash-dash-state ] }
1139         { [ dup CHAR: < = ] [ drop script-data-escaped-less-than-sign-state ] }
1140         { [ dup CHAR: \0 = ] [ unexpected-null-character script-data-escaped-state ] }
1141         { [ dup f = ] [ eof-in-script-html-comment-like-text ] }
1142         [ reach emit-char script-data-escaped-state ]
1143     } cond ;
1144
1145 : script-data-escaped-dash-state ( document n/f string -- document n'/f string )
1146     take-char (script-data-escaped-dash-state) ;
1147
1148
1149 : (script-data-escaped-dash-dash-state) ( document n/f string ch/f -- document n'/f string )
1150     {
1151         { [ dup CHAR: - = ] [ reach emit-char script-data-escaped-dash-dash-state ] }
1152         { [ dup CHAR: < = ] [ drop script-data-escaped-less-than-sign-state ] }
1153         { [ dup CHAR: > = ] [ reach emit-char script-data-state ] }
1154         { [ dup CHAR: \0 = ] [ unexpected-null-character script-data-escaped-state ] }
1155         { [ dup f = ] [ eof-in-script-html-comment-like-text ] }
1156         [ reach emit-char script-data-escaped-state ]
1157     } cond ;
1158
1159 : script-data-escaped-dash-dash-state ( document n/f string -- document n'/f string )
1160     take-char (script-data-escaped-dash-dash-state) ;
1161
1162
1163 : (script-data-escaped-less-than-sign-state) ( document n/f string ch/f -- document n'/f string )
1164     {
1165         { [ dup CHAR: / = ] [ drop pick reset-temporary-buffer script-data-escaped-end-tag-open-state ] }
1166         { [ dup ascii-alpha? ] [ [ pick reset-temporary-buffer CHAR: < reach emit-char ] dip (script-data-double-escape-start-state) ] }
1167         [ [ CHAR: < reach emit-char ] dip (script-data-escaped-state) ]
1168     } cond ;
1169
1170 : script-data-escaped-less-than-sign-state ( document n/f string -- document n'/f string )
1171     take-char (script-data-escaped-less-than-sign-state) ;
1172
1173
1174 : (script-data-escaped-end-tag-open-state) ( document n/f string ch/f -- document n'/f string )
1175     {
1176         { [ dup ascii-alpha? ] [ [ pick new-end-tag ] dip (script-data-escaped-end-tag-name-state) ] }
1177         [ [ "</" reach emit-string ] dip (script-data-escaped-state) ]
1178     } cond ;
1179
1180 : script-data-escaped-end-tag-open-state ( document n/f string -- document n'/f string )
1181     take-char (script-data-escaped-end-tag-open-state) ;
1182
1183
1184 : (script-data-escaped-end-tag-name-state) ( document n/f string ch/f -- document n'/f string )
1185     {
1186         { [ dup "\t\n\f\s" member? ] [
1187             drop pick appropriate-end-tag-token?
1188             [ before-attribute-name-state ] [ "</" reach emit-temporary-buffer-with script-data-escaped-state ] if
1189         ] }
1190         { [ dup CHAR: / = ] [
1191             drop pick appropriate-end-tag-token?
1192             [ self-closing-start-tag-state ] [ "</" reach emit-temporary-buffer-with script-data-escaped-state ] if
1193         ] }
1194         { [ dup CHAR: > = ] [
1195             drop pick appropriate-end-tag-token?
1196             [ pick emit-end-tag data-state ] [ "</" reach emit-temporary-buffer-with script-data-escaped-state ] if
1197         ] }
1198         { [ dup ascii-upper-alpha? ] [ [ 0x20 + reach push-tag-name ] [ reach push-temporary-buffer ] bi script-data-escaped-end-tag-name-state ] }
1199         { [ dup ascii-lower-alpha? ] [ [ reach push-tag-name ] [ reach push-temporary-buffer ] bi script-data-escaped-end-tag-name-state ] }
1200         [ [ "</" reach emit-temporary-buffer-with ] dip (script-data-escaped-state) ]
1201     } cond ;
1202
1203 : script-data-escaped-end-tag-name-state ( document n/f string -- document n'/f string )
1204     take-char (script-data-escaped-end-tag-name-state) ;
1205
1206
1207 : (script-data-double-escape-start-state) ( document n/f string ch/f -- document n'/f string )
1208     {
1209         { [ dup "\t\n\f\s/>" member? ] [
1210             reach emit-char
1211             pick temporary-buffer>> "script" sequence=
1212             [ script-data-double-escaped-state ] [ script-data-escaped-state ] if
1213         ] }
1214         { [ dup ascii-upper-alpha? ] [ [ 0x20 + reach push-tag-name ] [ reach push-temporary-buffer ] bi script-data-double-escape-start-state ] }
1215         { [ dup ascii-lower-alpha? ] [ [ reach push-tag-name ] [ reach push-temporary-buffer ] bi script-data-double-escape-start-state ] } ! todo
1216         [ (script-data-escaped-state) ]
1217     } cond ;
1218
1219 : script-data-double-escape-start-state ( document n/f string -- document n'/f string )
1220     take-char (script-data-double-escape-start-state) ;
1221
1222
1223 : (script-data-double-escaped-state) ( document n/f string ch/f -- document n'/f string )
1224     {
1225         { [ dup CHAR: - = ] [ reach emit-char script-data-double-escaped-dash-state ] }
1226         { [ dup CHAR: < = ] [ reach emit-char script-data-double-escaped-less-than-sign-state ] }
1227         { [ dup CHAR: \0 = ] [
1228             unexpected-null-character
1229             CHAR: replacement-character reach emit-char
1230             script-data-double-escaped-state
1231         ] }
1232         { [ dup f = ] [ eof-in-script-html-comment-like-text ] }
1233         [ reach emit-char script-data-double-escaped-state ]
1234     } cond ;
1235
1236 : script-data-double-escaped-state ( document n/f string -- document n'/f string )
1237     take-char (script-data-double-escaped-state) ;
1238
1239
1240 : (script-data-double-escaped-dash-state) ( document n/f string ch/f -- document n'/f string )
1241     {
1242         { [ dup CHAR: - = ] [ reach emit-char script-data-double-escaped-dash-dash-state ] }
1243         { [ dup CHAR: < = ] [ reach emit-char script-data-double-escaped-less-than-sign-state ] }
1244         { [ dup CHAR: \0 = ] [
1245             unexpected-null-character
1246             CHAR: replacement-character reach emit-char
1247             script-data-double-escaped-state
1248         ] }
1249         { [ dup f = ] [ eof-in-script-html-comment-like-text ] }
1250         [ reach emit-char script-data-double-escaped-state ]
1251     } cond ;
1252
1253 : script-data-double-escaped-dash-state ( document n/f string -- document n'/f string )
1254     take-char (script-data-double-escaped-dash-state) ;
1255
1256
1257 : (script-data-double-escaped-dash-dash-state) ( document n/f string ch/f -- document n'/f string )
1258     {
1259         { [ dup CHAR: - = ] [ reach emit-char script-data-double-escaped-dash-dash-state ] }
1260         { [ dup CHAR: < = ] [ reach emit-char script-data-double-escaped-less-than-sign-state ] }
1261         { [ dup CHAR: > = ] [ reach emit-char script-data-state ] }
1262         { [ dup CHAR: \0 = ] [
1263             unexpected-null-character
1264             CHAR: replacement-character reach emit-char
1265             script-data-double-escaped-state
1266         ] }
1267         { [ dup f = ] [ eof-in-script-html-comment-like-text ] }
1268         [ reach emit-char script-data-escaped-state ]
1269     } cond ;
1270
1271 : script-data-double-escaped-dash-dash-state ( document n/f string -- document n'/f string )
1272     take-char (script-data-double-escaped-dash-dash-state) ;
1273
1274
1275 : (script-data-double-escaped-less-than-sign-state) ( document n/f string ch/f -- document n'/f string )
1276     {
1277         { [ dup CHAR: / = ] [ reach emit-char pick reset-temporary-buffer script-data-double-escape-end-state ] }
1278         [ (script-data-double-escaped-state) ]
1279     } cond ;
1280
1281 : script-data-double-escaped-less-than-sign-state ( document n/f string -- document n'/f string )
1282     take-char (script-data-double-escaped-less-than-sign-state) ;
1283
1284
1285 : (script-data-double-escape-end-state) ( document n/f string ch/f -- document n'/f string )
1286     {
1287         { [ dup "\t\n\f\s/>" member? ] [
1288             reach emit-char
1289             pick temporary-buffer>> "script" sequence=
1290             [ script-data-escaped-state ] [ script-data-double-escaped-state ] if
1291         ] }
1292         { [ dup ascii-upper-alpha? ] [ [ 0x20 + reach push-tag-name ] [ reach push-temporary-buffer ] bi script-data-double-escape-end-state ] }
1293         { [ dup ascii-lower-alpha? ] [ [ reach push-tag-name ] [ reach push-temporary-buffer ] bi script-data-double-escape-end-state ] } ! todo
1294         [ (script-data-double-escaped-state) ]
1295     } cond ;
1296
1297 : script-data-double-escape-end-state ( document n/f string -- document n'/f string )
1298     take-char (script-data-double-escape-end-state) ;
1299
1300
1301 : (before-attribute-name-state) ( document n/f string ch/f -- document n'/f string )
1302     {
1303         { [ dup "\t\n\f\s" member? ] [ drop before-attribute-name-state ] }
1304         { [ dup "/>" member? ] [ (after-attribute-name-state) ] }
1305         { [ dup f = ] [ (after-attribute-name-state) ] }
1306         { [ dup CHAR: = = ] [ unexpected-equals-sign-before-attribute-name ] }
1307         [ reach push-attribute (attribute-name-state) ]
1308     } cond ;
1309
1310 : before-attribute-name-state ( document n/f string -- document n'/f string )
1311     take-char (before-attribute-name-state) ;
1312
1313
1314 : (attribute-name-state) ( document n/f string ch/f -- document n'/f string )
1315     {
1316         { [ dup "\t\n\f\s/>" member? ] [ (after-attribute-name-state) ] }
1317         { [ dup f = ] [ (after-attribute-name-state) ] }
1318         { [ dup CHAR: = = ] [ drop before-attribute-value-state ] }
1319         { [ dup ascii-upper-alpha? ] [
1320             0x20 + reach push-attribute-name
1321             attribute-name-state
1322         ] }
1323         { [ dup CHAR: \0 = ] [ unexpected-null-character ] }
1324         { [ dup "\"'<" member? ] [
1325             unexpected-character-in-attribute-name
1326             reach push-attribute-name attribute-name-state
1327         ] }
1328         [ reach push-attribute-name attribute-name-state ]
1329     } cond ;
1330
1331 : attribute-name-state ( document n/f string -- document n'/f string )
1332     take-char (attribute-name-state) ;
1333
1334
1335 : (after-attribute-name-state) ( document n/f string ch/f -- document n'/f string )
1336     {
1337         { [ dup "\t\n\f\s" member? ] [ drop after-attribute-name-state ] }
1338         { [ dup CHAR: / = ] [ drop self-closing-start-tag-state ] }
1339         { [ dup CHAR: = = ] [ drop before-attribute-value-state ] }
1340         { [ dup CHAR: > = ] [ drop pick emit-tag data-state ] }
1341         { [ dup f = ] [ eof-in-tag ] }
1342         [ [ pick push-attribute ] dip (attribute-name-state) ]
1343     } cond ;
1344
1345 : after-attribute-name-state ( document n/f string -- document n'/f string )
1346     take-char (after-attribute-name-state) ;
1347
1348
1349 : (before-attribute-value-state) ( document n/f string ch/f -- document n'/f string )
1350     {
1351         { [ dup "\t\n\f\s" member? ] [ drop before-attribute-name-state ] }
1352         { [ dup CHAR: " = ] [ drop attribute-value-double-quoted-state ] }
1353         { [ dup CHAR: ' = ] [ drop attribute-value-single-quoted-state ] }
1354         { [ dup CHAR: > = ] [ drop missing-attribute-value ] }
1355         [ (attribute-value-unquoted-state) ]
1356     } cond ;
1357
1358 : before-attribute-value-state ( document n/f string -- document n'/f string )
1359     take-char (before-attribute-value-state) ;
1360
1361
1362 : (attribute-value-double-quoted-state) ( document n/f string ch/f -- document n'/f string )
1363     {
1364         { [ dup CHAR: " = ] [ drop after-attribute-value-quoted-state ] }
1365         { [ dup CHAR: & = ] [
1366             drop
1367             [ \ attribute-value-double-quoted-state >>return-state ] 2dip character-reference-state
1368         ] }
1369         { [ dup CHAR: \0 = ] [ unexpected-null-character ] }
1370         { [ dup f = ] [ eof-in-tag ] }
1371         [ reach push-attribute-value attribute-value-double-quoted-state ]
1372     } cond ;
1373
1374 : attribute-value-double-quoted-state ( document n/f string -- document n'/f string )
1375     take-char (attribute-value-double-quoted-state) ;
1376
1377
1378 : (attribute-value-single-quoted-state) ( document n/f string ch/f -- document n'/f string )
1379     {
1380         { [ dup CHAR: ' = ] [ drop after-attribute-value-quoted-state ] }
1381         { [ dup CHAR: & = ] [
1382             drop [ \ attribute-value-single-quoted-state >>return-state ] 2dip
1383             character-reference-state
1384         ] }
1385         { [ dup CHAR: \0 = ] [
1386             drop unexpected-null-character
1387             CHAR: replacement-character reach push-attribute-value
1388         ] }
1389         { [ dup f = ] [ eof-in-tag ] }
1390         [ reach push-attribute-value attribute-value-single-quoted-state ]
1391     } cond ;
1392
1393 : attribute-value-single-quoted-state ( document n/f string -- document n'/f string )
1394     take-char (attribute-value-single-quoted-state) ;
1395
1396
1397 : (attribute-value-unquoted-state) ( document n/f string ch/f -- document n'/f string )
1398     {
1399         { [ dup "\t\n\f\s" member? ] [ drop before-attribute-name-state ] }
1400         { [ dup CHAR: & = ] [
1401             drop
1402             [ \ attribute-value-unquoted-state >>return-state ] 2dip character-reference-state
1403         ] }
1404         { [ dup CHAR: > = ] [ drop pick emit-tag data-state ] }
1405         { [ dup CHAR: \0 = ] [ drop unexpected-null-character CHAR: replacement-character reach push-attribute-value ] }
1406         { [ dup "\"'<=`" member? ] [
1407             unexpected-character-in-unquoted-attribute-value
1408             reach push-attribute-value
1409             attribute-value-unquoted-state
1410         ] }
1411         { [ dup f = ] [ eof-in-tag ] }
1412         [ reach push-attribute-value attribute-value-unquoted-state ]
1413     } cond ;
1414
1415 : attribute-value-unquoted-state ( document n/f string -- document n'/f string )
1416     take-char (attribute-value-unquoted-state) ;
1417
1418
1419 : (after-attribute-value-quoted-state) ( document n/f string ch/f -- document n'/f string )
1420     {
1421         { [ dup "\t\n\f\s" member? ] [ drop before-attribute-name-state ] }
1422         { [ dup CHAR: / = ] [ drop self-closing-start-tag-state ] }
1423         { [ dup CHAR: > = ] [ drop pick emit-tag data-state ] }
1424         { [ dup f = ] [ eof-in-tag ] }
1425         [ missing-whitespace-between-attributes (before-attribute-name-state) ]
1426     } cond ;
1427
1428 : after-attribute-value-quoted-state ( document n/f string -- document n'/f string )
1429     take-char (after-attribute-value-quoted-state) ;
1430
1431
1432 : (self-closing-start-tag-state) ( document n/f string ch/f -- document n'/f string )
1433     {
1434         { [ dup CHAR: > = ] [ drop pick [ set-self-closing ] [ emit-tag ] bi data-state ] }
1435         { [ dup f = ] [ eof-in-tag ] }
1436         [ unexpected-solidus-in-tag ]
1437     } cond ;
1438
1439 : self-closing-start-tag-state ( document n/f string -- document n'/f string )
1440     take-char (self-closing-start-tag-state) ;
1441
1442
1443 : (bogus-comment-state) ( document n/f string ch/f -- document n'/f string )
1444     {
1445         { [ dup CHAR: > = ] [ drop pick emit-comment-token data-state ] }
1446         { [ dup f = ] [ drop pick [ emit-comment-token ] [ emit-eof ] bi ] }
1447         { [ dup CHAR: \0 = ] [ drop unexpected-null-character CHAR: replacement-character reach push-comment-token ] }
1448         [ reach push-comment-token bogus-comment-state ]
1449     } cond ;
1450
1451 : bogus-comment-state ( document n/f string -- document n'/f string )
1452     take-char (bogus-comment-state) ;
1453
1454
1455 : markup-declaration-open-state ( document n/f string -- document n'/f string )
1456     {
1457         { [ "--" take-from? ] [ comment-start-state ] }
1458         { [ "DOCTYPE" take-from-insensitive? ] [ doctype-state ] }
1459         { [ "[CDATA[" take-from-insensitive? ] [ unimplemented* ] }
1460         [
1461             incorrectly-opened-comment ! bogus-comment-state
1462         ]
1463     } cond ;
1464
1465 : (comment-start-state) ( document n/f string ch/f -- document n'/f string )
1466     {
1467         { [ dup CHAR: - = ] [ drop comment-start-dash-state ] }
1468         { [ dup CHAR: > = ] [ drop abrupt-closing-of-empty-comment pick emit-comment-token data-state ] }
1469         [ (comment-state) ]
1470     } cond ;
1471
1472 : comment-start-state ( document n/f string -- document n'/f string )
1473     take-char (comment-start-state) ;
1474
1475
1476 : (comment-start-dash-state) ( document n/f string ch/f -- document n'/f string )
1477     {
1478         { [ dup CHAR: - = ] [ drop comment-end-state ] }
1479         { [ dup CHAR: > = ] [ drop abrupt-closing-of-empty-comment ] }
1480         { [ dup f = ] [ eof-in-comment ] }
1481         [ [ CHAR: - reach push-comment-token ] dip (comment-state) ]
1482     } cond ;
1483
1484 : comment-start-dash-state ( document n/f string -- document n'/f string )
1485     take-char (comment-start-dash-state) ;
1486
1487
1488 : (comment-state) ( document n/f string ch/f -- document n'/f string )
1489     {
1490         { [ dup CHAR: < = ] [ reach push-comment-token comment-less-than-sign-state ] }
1491         { [ dup CHAR: - = ] [ drop comment-end-dash-state ] }
1492         { [ dup CHAR: \0 = ] [ drop unexpected-null-character ] }
1493         { [ dup f = ] [ eof-in-comment ] }
1494         [ reach push-comment-token comment-state ]
1495     } cond ;
1496
1497 : comment-state ( document n/f string -- document n'/f string )
1498     take-char (comment-state) ;
1499
1500
1501 : (comment-less-than-sign-state) ( document n/f string ch/f -- document n'/f string )
1502     {
1503         { [ dup CHAR: ! = ] [ reach push-comment-token comment-less-than-sign-bang-state ] }
1504         { [ dup CHAR: < = ] [ reach push-comment-token comment-less-than-sign-state ] }
1505         [ (comment-state) ]
1506     } cond ;
1507
1508 : comment-less-than-sign-state ( document n/f string -- document n'/f string )
1509     take-char (comment-less-than-sign-state) ;
1510
1511
1512 : (comment-less-than-sign-bang-state) ( document n/f string ch/f -- document n'/f string )
1513     {
1514         { [ dup CHAR: - = ] [ reach push-comment-token comment-less-than-sign-bang-dash-state ] }
1515         [ (comment-state) ]
1516     } cond ;
1517
1518 : comment-less-than-sign-bang-state ( document n/f string -- document n'/f string )
1519     take-char (comment-less-than-sign-bang-state) ;
1520
1521
1522 : (comment-less-than-sign-bang-dash-state) ( document n/f string ch/f -- document n'/f string )
1523     {
1524         { [ dup CHAR: - = ] [ drop comment-less-than-sign-bang-dash-dash-state ] }
1525         [ (comment-end-dash-state) ]
1526     } cond ;
1527
1528 : comment-less-than-sign-bang-dash-state ( document n/f string -- document n'/f string )
1529     take-char (comment-less-than-sign-bang-dash-state) ;
1530
1531
1532 : (comment-less-than-sign-bang-dash-dash-state) ( document n/f string ch/f -- document n'/f string )
1533     {
1534         { [ dup CHAR: > = ] [ (comment-end-state) ] }
1535         { [ dup f = ] [ (comment-end-state) ] }
1536         [ nested-comment (comment-end-state) ]
1537     } cond ;
1538
1539 : comment-less-than-sign-bang-dash-dash-state ( document n/f string -- document n'/f string )
1540     take-char (comment-less-than-sign-bang-dash-dash-state) ;
1541
1542
1543 : (comment-end-dash-state) ( document n/f string ch/f -- document n'/f string )
1544     {
1545         { [ dup CHAR: - = ] [ drop comment-end-state ] }
1546         { [ dup f = ] [ eof-in-comment ] }
1547         [ [ CHAR: - reach push-comment-token ] dip (comment-state) ]
1548     } cond ;
1549
1550 : comment-end-dash-state ( document n/f string -- document n'/f string )
1551     take-char (comment-end-dash-state) ;
1552
1553
1554 : (comment-end-state) ( document n/f string ch/f -- document n'/f string )
1555     {
1556         { [ dup CHAR: > = ] [ drop pick emit-comment-token data-state ] }
1557         { [ dup CHAR: ! = ] [ drop comment-end-bang-state ] }
1558         { [ dup CHAR: - = ] [ reach push-comment-token comment-end-state ] }
1559         { [ dup f = ] [ drop eof-in-comment pick [ emit-comment-token ] [ emit-eof ] bi ] }
1560         [ [ "--" reach push-all-comment-token ] dip (comment-state) ]
1561     } cond ;
1562
1563 : comment-end-state ( document n/f string -- document n'/f string )
1564     take-char (comment-end-state) ;
1565
1566
1567 : (comment-end-bang-state) ( document n/f string ch/f -- document n'/f string )
1568     {
1569         { [ dup CHAR: - = ] [ drop comment-end-dash-state ] }
1570         { [ dup CHAR: > = ] [ drop incorrectly-closed-comment pick emit-comment-token data-state ] }
1571         { [ dup f = ] [ eof-in-comment ] }
1572         [ [ "--!" reach push-all-comment-token ] dip (comment-state) ]
1573     } cond ;
1574
1575 : comment-end-bang-state ( document n/f string -- document n'/f string )
1576     take-char (comment-end-bang-state) ;
1577
1578
1579 : (doctype-state) ( document n/f string ch/f -- document n'/f string )
1580     {
1581         { [ dup "\t\n\f\s" member? ] [ drop before-doctype-name-state ] }
1582         { [ dup CHAR: > = ] [ (before-doctype-name-state) ] }
1583         { [ dup f = ] [ drop eof-in-doctype pick [ new-doctype-with-quirks ] [ emit-doctype ] [ emit-eof ] tri ] }
1584         [ missing-whitespace-before-doctype-name ]
1585     } cond ;
1586
1587 : doctype-state ( document n/f string -- document n'/f string )
1588     take-char (doctype-state) ;
1589
1590
1591 : (before-doctype-name-state) ( document n/f string ch/f -- document n'/f string )
1592     {
1593         { [ dup "\t\n\f\s" member? ] [ drop before-doctype-name-state ] }
1594         { [ dup ascii-upper-alpha? ] [ 0x20 + reach new-doctype-from-ch doctype-name-state ] }
1595         { [ dup CHAR: \0 = ] [
1596             drop
1597             unexpected-null-character
1598             CHAR: replacement-character reach new-doctype-from-ch
1599             doctype-name-state
1600         ] }
1601         { [ dup CHAR: > = ] [
1602             drop missing-doctype-name
1603             pick [ new-doctype-with-quirks ] [ emit-doctype ] bi
1604         ] }
1605         { [ dup f = ] [
1606             drop eof-in-doctype
1607             pick [ new-doctype-with-quirks ] [ emit-doctype ] [ emit-eof ] tri
1608         ] }
1609         [ reach new-doctype-from-ch doctype-name-state ]
1610     } cond ;
1611
1612 : before-doctype-name-state ( document n/f string -- document n'/f string )
1613     take-char (before-doctype-name-state) ;
1614
1615
1616 : (doctype-name-state) ( document n/f string ch/f -- document n'/f string )
1617     {
1618         { [ dup "\t\n\f\s" member? ] [ drop after-doctype-name-state ] }
1619         { [ dup CHAR: > = ] [ drop pick emit-doctype data-state ] }
1620         { [ dup ascii-upper-alpha? ] [ 0x20 + reach push-doctype-name doctype-name-state ] }
1621         { [ dup CHAR: \0 = ] [
1622             drop unexpected-null-character
1623             CHAR: replacement-character pick push-doctype-name
1624             doctype-name-state
1625         ] }
1626         { [ dup f = ] [ drop eof-in-doctype pick [ emit-doctype ] [ emit-eof ] bi ] } ! force-quirks on for doctype
1627         [ reach push-doctype-name doctype-name-state ]
1628     } cond ;
1629
1630 : doctype-name-state ( document n/f string -- document n'/f string )
1631     take-char (doctype-name-state) ;
1632
1633
1634 : (after-doctype-name-state) ( document n/f string ch/f -- document n'/f string )
1635     {
1636         { [ dup "\t\n\f\s" member? ] [ drop after-doctype-name-state ] }
1637         { [ dup CHAR: > = ] [ drop pick emit-doctype data-state ] }
1638         { [ dup f = ] [ eof-in-doctype ] }
1639         { [ [ "PUBLIC" take-from-insensitive? ] dip swap ] [ drop after-doctype-public-keyword-state ] }
1640         { [ [ "SYSTEM" take-from-insensitive? ] dip swap ] [ drop after-doctype-system-keyword-state ] }
1641         [ invalid-character-sequence-after-doctype-name ]
1642     } cond ;
1643
1644 : after-doctype-name-state ( document n/f string -- document n'/f string )
1645     take-char (after-doctype-name-state) ;
1646
1647
1648 : (after-doctype-public-keyword-state) ( document n/f string ch/f -- document n'/f string )
1649     {
1650         { [ dup "\t\n\f\s" member? ] [ drop before-doctype-public-identifier-state ] }
1651         { [ dup CHAR: " = ] [ missing-whitespace-after-doctype-public-keyword ] }
1652         { [ dup CHAR: ' = ] [ missing-whitespace-after-doctype-public-keyword ] }
1653         { [ dup CHAR: > = ] [ drop missing-doctype-public-identifier force-quirks data-state ] }
1654         { [ dup f = ] [ drop eof-in-doctype pick [ emit-doctype ] [ emit-eof ] bi ] }
1655         [
1656             missing-quote-before-doctype-public-identifier
1657             [ reach force-quirks ] dip
1658             (bogus-doctype-state)
1659         ]
1660     } cond ;
1661
1662 : after-doctype-public-keyword-state ( document n/f string -- document n'/f string )
1663     take-char (after-doctype-public-keyword-state) ;
1664
1665
1666 : (before-doctype-public-identifier-state) ( document n/f string ch/f -- document n'/f string )
1667     {
1668         { [ dup "\t\n\f\s" member? ] [ drop before-doctype-public-identifier-state ] }
1669         { [ dup CHAR: " = ] [
1670             drop pick initialize-doctype-public-identifier
1671             doctype-public-identifier-double-quoted-state
1672         ] }
1673         { [ dup CHAR: ' = ] [
1674             drop pick initialize-doctype-public-identifier
1675             doctype-public-identifier-single-quoted-state
1676         ] }
1677         { [ dup CHAR: > = ] [
1678             drop missing-doctype-public-identifier
1679             pick [ force-quirks ] [ emit-doctype ] bi
1680             data-state
1681         ] }
1682         { [ dup f = ] [ drop eof-in-doctype pick [ emit-doctype ] [ emit-eof ] bi ] }
1683         [
1684             missing-quote-before-doctype-public-identifier
1685             [ reach force-quirks ] dip
1686             (bogus-doctype-state)
1687         ]
1688     } cond ;
1689
1690 : before-doctype-public-identifier-state ( document n/f string -- document n'/f string )
1691     take-char (before-doctype-public-identifier-state) ;
1692
1693
1694 : (doctype-public-identifier-double-quoted-state) ( document n/f string ch/f -- document n'/f string )
1695     {
1696         { [ dup CHAR: " = ] [ drop after-doctype-public-identifier-state ] }
1697         { [ dup CHAR: \0 = ] [
1698             drop
1699             unexpected-null-character
1700             CHAR: replacement-character pick push-doctype-public-identifier
1701             doctype-public-identifier-double-quoted-state
1702         ] }
1703         { [ dup CHAR: > = ] [
1704             drop abrupt-doctype-public-identifier
1705             pick [ force-quirks ] [ emit-doctype ] bi
1706             data-state
1707         ] }
1708         { [ dup f = ] [
1709             drop eof-in-doctype
1710             pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri
1711         ] }
1712         [ reach push-doctype-public-identifier doctype-public-identifier-double-quoted-state ]
1713     } cond ;
1714
1715 : doctype-public-identifier-double-quoted-state ( document n/f string -- document n'/f string )
1716     take-char (doctype-public-identifier-double-quoted-state) ;
1717
1718
1719 : (doctype-public-identifier-single-quoted-state) ( document n/f string ch/f -- document n'/f string )
1720     {
1721         { [ dup CHAR: ' = ] [ drop after-doctype-public-identifier-state ] }
1722         { [ dup CHAR: \0 = ] [
1723             drop
1724             unexpected-null-character
1725             CHAR: replacement-character pick push-doctype-public-identifier
1726             doctype-public-identifier-double-quoted-state
1727         ] }
1728         { [ dup CHAR: > = ] [
1729             drop abrupt-doctype-public-identifier
1730             pick [ force-quirks ] [ emit-doctype ] bi
1731             data-state
1732         ] }
1733         { [ dup f = ] [
1734             drop eof-in-doctype
1735             pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri
1736         ] }
1737         [ reach push-doctype-public-identifier doctype-public-identifier-single-quoted-state ]
1738     } cond ;
1739
1740 : doctype-public-identifier-single-quoted-state ( document n/f string -- document n'/f string )
1741     take-char (doctype-public-identifier-single-quoted-state) ;
1742
1743
1744 : (after-doctype-public-identifier-state) ( document n/f string ch/f -- document n'/f string )
1745     {
1746         { [ dup "\t\n\f\s" member? ] [ drop between-doctype-public-and-system-identifiers-state ] }
1747         { [ dup CHAR: > = ] [
1748             drop pick emit-doctype
1749             data-state
1750         ] }
1751         { [ dup CHAR: " = ] [
1752             drop missing-whitespace-between-doctype-public-and-system-identifiers
1753             pick initialize-doctype-system-identifier
1754             doctype-system-identifier-double-quoted-state
1755         ] }
1756         { [ dup CHAR: ' = ] [
1757             drop missing-whitespace-between-doctype-public-and-system-identifiers
1758             pick initialize-doctype-system-identifier
1759             doctype-system-identifier-single-quoted-state
1760         ] }
1761         { [ dup f = ] [ drop eof-in-doctype pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri ] }
1762         [
1763             missing-quote-before-doctype-system-identifier
1764             [ reach force-quirks ] dip
1765             (bogus-doctype-state)
1766         ]
1767     } cond ;
1768
1769 : after-doctype-public-identifier-state ( document n/f string -- document n'/f string )
1770     take-char (after-doctype-public-identifier-state) ;
1771
1772
1773 : (between-doctype-public-and-system-identifiers-state) ( document n/f string ch/f -- document n'/f string )
1774     {
1775         { [ dup "\t\n\f\s" member? ] [ drop between-doctype-public-and-system-identifiers-state ] }
1776         { [ dup CHAR: > = ] [
1777             drop pick emit-doctype
1778             data-state
1779         ] }
1780         { [ dup CHAR: " = ] [
1781             drop pick initialize-doctype-system-identifier
1782             doctype-system-identifier-double-quoted-state
1783         ] }
1784         { [ dup CHAR: ' = ] [
1785             drop pick initialize-doctype-system-identifier
1786             doctype-system-identifier-single-quoted-state
1787         ] }
1788         { [ dup f = ] [ drop eof-in-doctype pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri ] }
1789         [
1790             missing-quote-before-doctype-system-identifier
1791             [ reach force-quirks ] dip
1792             (bogus-doctype-state)
1793         ]
1794     } cond ;
1795
1796 : between-doctype-public-and-system-identifiers-state ( document n/f string -- document n'/f string )
1797     take-char (between-doctype-public-and-system-identifiers-state) ;
1798
1799
1800 : (after-doctype-system-keyword-state) ( document n/f string ch/f -- document n'/f string )
1801     {
1802         { [ dup "\t\n\f\s" member? ] [ drop between-doctype-public-and-system-identifiers-state ] }
1803         { [ dup CHAR: " = ] [
1804             drop missing-whitespace-after-doctype-system-keyword
1805             pick initialize-doctype-system-identifier
1806             doctype-system-identifier-double-quoted-state
1807         ] }
1808         { [ dup CHAR: ' = ] [
1809             drop missing-whitespace-after-doctype-system-keyword
1810             pick initialize-doctype-system-identifier
1811             doctype-system-identifier-single-quoted-state
1812         ] }
1813         { [ dup CHAR: > = ] [
1814             drop missing-doctype-system-identifier
1815             pick [ force-quirks ] [ emit-doctype ] bi
1816             data-state
1817         ] }
1818         { [ dup f = ] [ drop eof-in-doctype pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri ] }
1819         [
1820             missing-quote-before-doctype-system-identifier
1821             [ reach force-quirks ] dip
1822             (bogus-doctype-state)
1823         ]
1824     } cond ;
1825
1826 : after-doctype-system-keyword-state ( document n/f string -- document n'/f string )
1827     take-char (after-doctype-system-keyword-state) ;
1828
1829
1830 : (before-doctype-system-identifier-state) ( document n/f string ch/f -- document n'/f string )
1831     {
1832         { [ dup "\t\n\f\s" member? ] [ drop before-doctype-system-identifier-state ] }
1833         { [ dup CHAR: " = ] [
1834             drop pick initialize-doctype-system-identifier
1835             doctype-system-identifier-double-quoted-state
1836         ] }
1837         { [ dup CHAR: ' = ] [
1838             drop pick initialize-doctype-system-identifier
1839             doctype-system-identifier-single-quoted-state
1840         ] }
1841         { [ dup CHAR: > = ] [
1842             drop missing-doctype-system-identifier
1843             pick [ force-quirks ] [ emit-doctype ] bi
1844             data-state
1845         ] }
1846         { [ dup f = ] [ drop eof-in-doctype pick [ emit-doctype ] [ emit-eof ] bi ] }
1847         [
1848             missing-quote-before-doctype-system-identifier
1849             [ reach force-quirks ] dip
1850             (bogus-doctype-state)
1851         ]
1852     } cond ;
1853
1854 : before-doctype-system-identifier-state ( document n/f string -- document n'/f string )
1855     take-char (before-doctype-system-identifier-state) ;
1856
1857
1858 : (doctype-system-identifier-double-quoted-state) ( document n/f string ch/f -- document n'/f string )
1859     {
1860         { [ dup CHAR: " = ] [ drop after-doctype-system-identifier-state ] }
1861         { [ dup CHAR: \0 = ] [
1862             drop
1863             unexpected-null-character
1864             CHAR: replacement-character pick push-doctype-system-identifier
1865             doctype-system-identifier-double-quoted-state
1866         ] }
1867         { [ dup CHAR: > = ] [
1868             drop abrupt-doctype-system-identifier
1869             pick [ force-quirks ] [ emit-doctype ] bi
1870             data-state
1871         ] }
1872         { [ dup f = ] [
1873             drop eof-in-doctype
1874             pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri
1875         ] }
1876         [ reach push-doctype-system-identifier doctype-system-identifier-double-quoted-state ]
1877     } cond ;
1878
1879 : doctype-system-identifier-double-quoted-state ( document n/f string -- document n'/f string )
1880     take-char (doctype-system-identifier-double-quoted-state) ;
1881
1882
1883 : (doctype-system-identifier-single-quoted-state) ( document n/f string ch/f -- document n'/f string )
1884     {
1885         { [ dup CHAR: ' = ] [ drop after-doctype-system-identifier-state ] }
1886         { [ dup CHAR: \0 = ] [
1887             drop
1888             unexpected-null-character
1889             CHAR: replacement-character pick push-doctype-system-identifier
1890             doctype-system-identifier-double-quoted-state
1891         ] }
1892         { [ dup CHAR: > = ] [
1893             drop abrupt-doctype-system-identifier
1894             pick [ force-quirks ] [ emit-doctype ] bi
1895             data-state
1896         ] }
1897         { [ dup f = ] [
1898             drop eof-in-doctype
1899             pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri
1900         ] }
1901         [ reach push-doctype-system-identifier doctype-system-identifier-single-quoted-state ]
1902     } cond ;
1903
1904 : doctype-system-identifier-single-quoted-state ( document n/f string -- document n'/f string )
1905     take-char (doctype-system-identifier-single-quoted-state) ;
1906
1907
1908 : (after-doctype-system-identifier-state) ( document n/f string ch/f -- document n'/f string )
1909     {
1910         { [ dup "\t\n\f\s" member? ] [ drop after-doctype-system-identifier-state ] }
1911         { [ dup CHAR: > = ] [
1912             drop pick emit-doctype
1913             data-state
1914         ] }
1915         { [ dup f = ] [ drop eof-in-doctype pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri ] }
1916         [
1917             unexpected-character-after-doctype-system-identifier
1918             [ reach force-quirks ] dip
1919             (bogus-doctype-state)
1920         ]
1921     } cond ;
1922
1923 : after-doctype-system-identifier-state ( document n/f string -- document n'/f string )
1924     take-char (after-doctype-system-identifier-state) ;
1925
1926
1927 : (bogus-doctype-state) ( document n/f string ch/f -- document n'/f string )
1928     {
1929         { [ dup CHAR: > = ] [ drop pick emit-doctype data-state ] }
1930         { [ dup CHAR: \0 = ] [ drop unexpected-null-character bogus-doctype-state ] }
1931         { [ dup f = ] [ drop eof-in-doctype pick emit-eof ] }
1932         [ drop bogus-doctype-state ]
1933     } cond ;
1934
1935 : bogus-doctype-state ( document n/f string -- document n'/f string )
1936     take-char (bogus-doctype-state) ;
1937
1938
1939 : (cdata-section-state) ( document n/f string ch/f -- document n'/f string )
1940     {
1941         { [ dup CHAR: ] = ] [ drop cdata-section-bracket-state ] }
1942         { [ dup f = ] [ drop eof-in-cdata pick emit-eof ] }
1943         [ reach emit-char cdata-section-state ]
1944     } cond ;
1945
1946 : cdata-section-state ( document n/f string -- document n'/f string )
1947     take-char (cdata-section-state) ;
1948
1949
1950 : (cdata-section-bracket-state) ( document n/f string ch/f -- document n'/f string )
1951     {
1952         { [ dup CHAR: ] = ] [ drop cdata-section-end-state ] }
1953         [ [ CHAR: ] reach emit-char ] dip (cdata-section-state) ]
1954     } cond ;
1955
1956 : cdata-section-bracket-state ( document n/f string -- document n'/f string )
1957     take-char (cdata-section-bracket-state) ;
1958
1959
1960 : (cdata-section-end-state) ( document n/f string ch/f -- document n'/f string )
1961     {
1962         { [ dup CHAR: ] = ] [ reach emit-char cdata-section-end-state ] }
1963         { [ dup CHAR: > = ] [ drop data-state ] }
1964         [ [ "]]" reach emit-string ] dip (cdata-section-state) ]
1965     } cond ;
1966
1967 : cdata-section-end-state ( document n/f string -- document n'/f string )
1968     take-char (cdata-section-end-state) ;
1969
1970
1971 : (character-reference-state) ( document n/f string ch/f -- document n'/f string )
1972     [ CHAR: & reach ch>new-temporary-buffer ] dip
1973     {
1974         { [ dup ascii-alphanumeric? ] [ (named-character-reference-state) ] }
1975         { [ dup CHAR: # = ] [ reach push-temporary-buffer numeric-character-reference-state ] }
1976         [ reach flush-temporary-buffer (return-state) ]
1977     } cond ;
1978
1979 : character-reference-state ( document n/f string -- document n'/f string )
1980     take-char (character-reference-state) ;
1981
1982
1983 : (named-character-reference-state) ( document n/f string ch/f -- document n'/f string )
1984     reach push-temporary-buffer
1985     pick named-character-match?
1986     [
1987         drop ! exact match, drop prefix match
1988         ! XXX: check me
1989         {
1990             [ pick temporary-buffer-attribute? ]
1991             [ pick temporary-buffer>> ?last CHAR: ; = not ]
1992             [ 3dup peek-from { [ CHAR: = = ] [ ascii-alphanumeric? ] } 1|| ]
1993         } 0&& [
1994             unimplemented*
1995             flush-temporary-buffer
1996             return-state
1997         ] [
1998             pick [ take-named-character ] [ flush-temporary-buffer ] bi return-state
1999         ] if
2000     ] [
2001         ! prefix match?
2002         [ named-character-reference-state ]
2003         [ pick flush-temporary-buffer ambiguous-ampersand-state ] if
2004     ] if ;
2005
2006 : named-character-reference-state ( document n/f string -- document n'/f string )
2007     take-char (named-character-reference-state) ;
2008
2009
2010 : (ambiguous-ampersand-state) ( document n/f string ch/f -- document n'/f string )
2011     {
2012         { [ dup ascii-alphanumeric? ] [
2013             unimplemented*
2014         ] }
2015         { [ dup CHAR: ; = ] [ unknown-named-character-reference (return-state) ] }
2016         [ (return-state) ]
2017     } cond ;
2018
2019 : ambiguous-ampersand-state ( document n/f string -- document n'/f string )
2020     take-char (ambiguous-ampersand-state) ;
2021
2022
2023 : (numeric-character-reference-state) ( document n/f string ch/f -- document n'/f string )
2024     {
2025         { [ dup "xX" member? ] [ reach push-temporary-buffer hexadecimal-character-reference-start-state ] }
2026         [ (decimal-character-reference-start-state) ]
2027     } cond ;
2028
2029 : numeric-character-reference-state ( document n/f string -- document n'/f string )
2030     take-char (numeric-character-reference-state) ;
2031
2032
2033 : (hexadecimal-character-reference-start-state) ( document n/f string ch/f -- document n'/f string )
2034     {
2035         { [ dup ascii-hex-digit? ] [ (hexadecimal-character-reference-state) ] }
2036         [ absence-of-digits-in-numeric-character-reference reach flush-temporary-buffer (return-state) ]
2037     } cond ;
2038
2039 : hexadecimal-character-reference-start-state ( document n/f string -- document n'/f string )
2040     take-char (hexadecimal-character-reference-start-state) ;
2041
2042
2043 : (decimal-character-reference-start-state) ( document n/f string ch/f -- document n'/f string )
2044     {
2045         { [ dup ascii-digit? ] [ (decimal-character-reference-state) ] }
2046         [ absence-of-digits-in-numeric-character-reference reach flush-temporary-buffer (return-state) ]
2047     } cond ;
2048
2049 : decimal-character-reference-start-state ( document n/f string -- document n'/f string )
2050     take-char (decimal-character-reference-start-state) ;
2051
2052
2053 : (hexadecimal-character-reference-state) ( document n/f string ch/f -- document n'/f string )
2054     {
2055         { [ dup ascii-digit? ] [ unimplemented* ] }
2056         { [ dup ascii-upper-hex-digit? ] [ unimplemented* ] }
2057         { [ dup ascii-lower-hex-digit? ] [ unimplemented* ] }
2058         { [ dup CHAR: ; = ] [ drop numeric-character-reference-end-state ] }
2059         [ missing-semicolon-after-character-reference ]
2060     } cond ;
2061
2062 : hexadecimal-character-reference-state ( document n/f string -- document n'/f string )
2063     take-char (hexadecimal-character-reference-state) ;
2064
2065
2066 : (decimal-character-reference-state) ( document n/f string ch/f -- document n'/f string )
2067     {
2068         { [ dup ascii-digit? ] [ unimplemented* ] }
2069         { [ dup CHAR: ; = ] [ drop numeric-character-reference-end-state ] }
2070         [ missing-semicolon-after-character-reference ]
2071     } cond ;
2072
2073 : decimal-character-reference-state ( document n/f string -- document n'/f string )
2074     take-char (decimal-character-reference-state) ;
2075
2076
2077 : (numeric-character-reference-end-state) ( document n/f string ch/f -- document n'/f string )
2078     {
2079         [ missing-semicolon-after-character-reference ]
2080     } cond ;
2081
2082 : numeric-character-reference-end-state ( document n/f string -- document n'/f string )
2083     take-char (numeric-character-reference-end-state) ;
2084
2085
2086
2087 : parse-html5 ( string -- document )
2088     [ <document> 0 ] dip data-state 2drop ;
2089