1 ! Copyright (C) 2020 Doug Coleman.
2 ! See https://factorcode.org/license.txt for BSD license.
4 USING: accessors arrays assocs combinators
5 combinators.short-circuit io io.encodings.utf8 io.files json
6 kernel math math.order memoize modern.slices prettyprint
7 sequences sequences.extras strings suffix-arrays words ;
! Build a new string buffer whose single element is ch.
: 1sbuf ( ch -- sbuf )
    SBUF" " clone [ push ] keep ; inline
! Build a new string buffer holding ch; when the input is f the
! buffer is returned empty.
: ?1sbuf ( ch -- sbuf )
    SBUF" " clone swap [ over push ] when* ; inline
14 ! https://html.spec.whatwg.org/multipage/parsing.html#tokenization
16 ! https://infra.spec.whatwg.org/#namespaces
17 CONSTANT: html-namespace "http://www.w3.org/1999/xhtml"
18 CONSTANT: mathml-namespace "http://www.w3.org/1998/Math/MathML"
19 CONSTANT: svg-namespace "http://www.w3.org/2000/svg"
20 CONSTANT: xlink-namespace "http://www.w3.org/1999/xlink"
21 CONSTANT: xml-namespace "http://www.w3.org/XML/1998/namespace"
22 CONSTANT: xmlns-namespace "http://www.w3.org/2000/xmlns/"
29 DEFER: (rawtext-state)
30 DEFER: script-data-state
31 DEFER: (script-data-state)
32 DEFER: plaintext-state
33 DEFER: (plaintext-state)
35 DEFER: (tag-open-state)
36 DEFER: end-tag-open-state
37 DEFER: (end-tag-open-state)
39 DEFER: (tag-name-state)
40 DEFER: rcdata-less-than-sign-state
41 DEFER: (rcdata-less-than-sign-state)
42 DEFER: rcdata-end-tag-open-state
43 DEFER: (rcdata-end-tag-open-state)
44 DEFER: rcdata-end-tag-name-state
45 DEFER: (rcdata-end-tag-name-state)
46 DEFER: rawtext-less-than-sign-state
47 DEFER: (rawtext-less-than-sign-state)
48 DEFER: rawtext-end-tag-open-state
49 DEFER: (rawtext-end-tag-open-state)
50 DEFER: rawtext-end-tag-name-state
51 DEFER: (rawtext-end-tag-name-state)
52 DEFER: script-data-less-than-sign-state
53 DEFER: (script-data-less-than-sign-state)
54 DEFER: script-data-end-tag-open-state
55 DEFER: (script-data-end-tag-open-state)
56 DEFER: script-data-end-tag-name-state
57 DEFER: (script-data-end-tag-name-state)
58 DEFER: script-data-escape-start-state
59 DEFER: (script-data-escape-start-state)
60 DEFER: script-data-escape-start-dash-state
61 DEFER: (script-data-escape-start-dash-state)
62 DEFER: script-data-escaped-state
63 DEFER: (script-data-escaped-state)
64 DEFER: script-data-escaped-dash-state
65 DEFER: (script-data-escaped-dash-state)
66 DEFER: script-data-escaped-dash-dash-state
67 DEFER: (script-data-escaped-dash-dash-state)
68 DEFER: script-data-escaped-less-than-sign-state
69 DEFER: (script-data-escaped-less-than-sign-state)
70 DEFER: script-data-escaped-end-tag-open-state
71 DEFER: (script-data-escaped-end-tag-open-state)
72 DEFER: script-data-escaped-end-tag-name-state
73 DEFER: (script-data-escaped-end-tag-name-state)
74 DEFER: script-data-double-escape-start-state
75 DEFER: (script-data-double-escape-start-state)
76 DEFER: script-data-double-escaped-state
77 DEFER: (script-data-double-escaped-state)
78 DEFER: script-data-double-escaped-dash-state
79 DEFER: (script-data-double-escaped-dash-state)
80 DEFER: script-data-double-escaped-dash-dash-state
81 DEFER: (script-data-double-escaped-dash-dash-state)
82 DEFER: script-data-double-escaped-less-than-sign-state
83 DEFER: (script-data-double-escaped-less-than-sign-state)
84 DEFER: script-data-double-escape-end-state
85 DEFER: (script-data-double-escape-end-state)
86 DEFER: before-attribute-name-state
87 DEFER: (before-attribute-name-state)
88 DEFER: attribute-name-state
89 DEFER: (attribute-name-state)
90 DEFER: after-attribute-name-state
91 DEFER: (after-attribute-name-state)
92 DEFER: before-attribute-value-state
93 DEFER: (before-attribute-value-state)
94 DEFER: attribute-value-double-quoted-state
95 DEFER: (attribute-value-double-quoted-state)
96 DEFER: attribute-value-single-quoted-state
97 DEFER: (attribute-value-single-quoted-state)
98 DEFER: attribute-value-unquoted-state
99 DEFER: (attribute-value-unquoted-state)
100 DEFER: after-attribute-value-quoted-state
101 DEFER: (after-attribute-value-quoted-state)
102 DEFER: self-closing-start-tag-state
103 DEFER: (self-closing-start-tag-state)
104 DEFER: bogus-comment-state
105 DEFER: (bogus-comment-state)
106 DEFER: markup-declaration-open-state
107 DEFER: (markup-declaration-open-state)
108 DEFER: comment-start-state
109 DEFER: (comment-start-state)
110 DEFER: comment-start-dash-state
111 DEFER: (comment-start-dash-state)
113 DEFER: (comment-state)
114 DEFER: comment-less-than-sign-state
115 DEFER: (comment-less-than-sign-state)
116 DEFER: comment-less-than-sign-bang-state
117 DEFER: (comment-less-than-sign-bang-state)
118 DEFER: comment-less-than-sign-bang-dash-state
119 DEFER: (comment-less-than-sign-bang-dash-state)
120 DEFER: comment-less-than-sign-bang-dash-dash-state
121 DEFER: (comment-less-than-sign-bang-dash-dash-state)
122 DEFER: comment-end-dash-state
123 DEFER: (comment-end-dash-state)
124 DEFER: comment-end-state
125 DEFER: (comment-end-state)
126 DEFER: comment-end-bang-state
127 DEFER: (comment-end-bang-state)
129 DEFER: (doctype-state)
130 DEFER: before-doctype-name-state
131 DEFER: (before-doctype-name-state)
132 DEFER: doctype-name-state
133 DEFER: (doctype-name-state)
134 DEFER: after-doctype-name-state
135 DEFER: (after-doctype-name-state)
136 DEFER: after-doctype-public-keyword-state
137 DEFER: (after-doctype-public-keyword-state)
138 DEFER: before-doctype-public-identifier-state
139 DEFER: (before-doctype-public-identifier-state)
140 DEFER: doctype-public-identifier-double-quoted-state
141 DEFER: (doctype-public-identifier-double-quoted-state)
142 DEFER: doctype-public-identifier-single-quoted-state
143 DEFER: (doctype-public-identifier-single-quoted-state)
144 DEFER: after-doctype-public-identifier-state
145 DEFER: (after-doctype-public-identifier-state)
146 DEFER: between-doctype-public-and-system-identifiers-state
147 DEFER: (between-doctype-public-and-system-identifiers-state)
148 DEFER: after-doctype-system-keyword-state
149 DEFER: (after-doctype-system-keyword-state)
150 DEFER: before-doctype-system-identifier-state
151 DEFER: (before-doctype-system-identifier-state)
152 DEFER: doctype-system-identifier-double-quoted-state
153 DEFER: (doctype-system-identifier-double-quoted-state)
154 DEFER: doctype-system-identifier-single-quoted-state
155 DEFER: (doctype-system-identifier-single-quoted-state)
156 DEFER: after-doctype-system-identifier-state
157 DEFER: (after-doctype-system-identifier-state)
158 DEFER: bogus-doctype-state
159 DEFER: (bogus-doctype-state)
160 DEFER: cdata-section-state
161 DEFER: (cdata-section-state)
162 DEFER: cdata-section-bracket-state
163 DEFER: (cdata-section-bracket-state)
164 DEFER: cdata-section-end-state
165 DEFER: (cdata-section-end-state)
166 DEFER: character-reference-state
167 DEFER: (character-reference-state)
168 DEFER: named-character-reference-state
169 DEFER: (named-character-reference-state)
170 DEFER: ambiguous-ampersand-state
171 DEFER: (ambiguous-ampersand-state)
172 DEFER: numeric-character-reference-state
173 DEFER: (numeric-character-reference-state)
174 DEFER: hexadecimal-character-reference-start-state
175 DEFER: (hexadecimal-character-reference-start-state)
176 DEFER: decimal-character-reference-start-state
177 DEFER: (decimal-character-reference-start-state)
178 DEFER: hexadecimal-character-reference-state
179 DEFER: (hexadecimal-character-reference-state)
180 DEFER: decimal-character-reference-state
181 DEFER: (decimal-character-reference-state)
182 DEFER: numeric-character-reference-end-state
183 DEFER: (numeric-character-reference-end-state)
186 ERROR: unimplemented string ;
187 ERROR: unimplemented* ;
189 ! Errors: https://html.spec.whatwg.org/multipage/parsing.html#parse-errors
190 ERROR: abrupt-closing-of-empty-comment ;
191 ERROR: abrupt-doctype-public-identifier ;
192 ERROR: abrupt-doctype-system-identifier ;
193 ERROR: absence-of-digits-in-numeric-character-reference ;
194 ERROR: cdata-in-html-content ;
195 ERROR: character-reference-outside-unicode-range ;
196 ERROR: control-character-in-input-stream ;
197 ERROR: control-character-reference ;
198 ERROR: end-tag-with-attributes ;
199 ERROR: duplicate-attribute ;
200 ERROR: end-tag-with-trailing-solidus ;
201 ERROR: eof-before-tag-name ;
202 ERROR: eof-in-cdata ;
203 ERROR: eof-in-comment ;
204 ERROR: eof-in-doctype ;
205 ERROR: eof-in-script-html-comment-like-text ;
207 ERROR: incorrectly-closed-comment ;
208 ERROR: incorrectly-opened-comment ;
209 ERROR: invalid-character-sequence-after-doctype-name ;
210 ERROR: invalid-first-character-of-tag-name ;
211 ERROR: missing-attribute-value ;
212 ERROR: missing-doctype-name ;
213 ERROR: missing-doctype-public-identifier ;
214 ERROR: missing-doctype-system-identifier ;
215 ERROR: missing-end-tag-name ;
216 ERROR: missing-quote-before-doctype-public-identifier ;
218 ERROR: missing-quote-before-doctype-system-identifier ;
219 ERROR: missing-semicolon-after-character-reference ;
220 ERROR: missing-whitespace-after-doctype-public-keyword ;
221 ERROR: missing-whitespace-after-doctype-system-keyword ;
222 ERROR: missing-whitespace-before-doctype-name ;
223 ERROR: missing-whitespace-between-attributes ;
224 ERROR: missing-whitespace-between-doctype-public-and-system-identifiers ;
225 ERROR: nested-comment ;
226 ERROR: noncharacter-character-reference ;
227 ERROR: noncharacter-in-input-stream ;
228 ERROR: non-void-html-element-start-tag-with-trailing-solidus ;
229 ERROR: null-character-reference ;
230 ERROR: surrogate-character-reference ;
231 ERROR: surrogate-in-input-stream ;
232 ERROR: unexpected-character-after-doctype-system-identifier ;
233 ERROR: unexpected-character-in-attribute-name ;
234 ERROR: unexpected-character-in-unquoted-attribute-value ;
235 ERROR: unexpected-equals-sign-before-attribute-name ;
236 ERROR: unexpected-null-character ;
237 ERROR: unexpected-question-mark-instead-of-tag-name ;
238 ERROR: unexpected-solidus-in-tag ;
239 ERROR: unknown-named-character-reference ;
241 ! Tree insertion modes
242 SINGLETONS: initial-mode before-html-mode before-head-mode
243 in-head-mode in-head-noscript-mode after-head-mode
244 in-body-mode text-mode in-table-mode in-table-text-mode
245 in-caption-mode in-column-group-mode in-table-body-mode
246 in-row-mode in-cell-mode in-select-mode in-select-in-table-mode in-template-mode
247 after-body-mode in-frameset-mode after-frameset-mode after-after-body-mode
248 after-after-frameset-mode ;
254 scripting? ! set in constructor
255 frameset-ok? ! frameset-ok? but we want default to f
259 head-element-pointer ! set during insertion time
260 parser-cannot-change-mode-flag
262 original-insertion-mode
279 ! "reset the insertion mode appropriately"
280 ! : reset-insertion-mode ( document -- document )
282 ! dup open-elements>> ?last >>node
283 ! dup [ open-elements>> ?first ] [ node>> ] bi = [
284 ! t >>last dup node>> >>context
287 ! { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
289 ! dup name>> >lower { "td" "th" } member?
290 ! pick last>> f = and
291 ! ] [ drop in-select >>insertion-mode ] }
292 ! { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
293 ! { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
294 ! { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
295 ! { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
296 ! { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
297 ! { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
298 ! { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
299 ! { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
300 ! { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
304 : temporary-buffer-attribute? ( document -- ? )
307 attribute-value-unquoted-state
308 attribute-value-single-quoted-state
309 attribute-value-double-quoted-state
312 ! name, public/system identifier should not be empty strings
313 ! until the state machine demands it
320 : <doctype> ( -- doctype )
323 : new-doctype-from-ch ( ch document -- )
327 ] dip doctype<< ; inline
! Store a doctype built by <doctype>, with its quirks? flag set to t,
! in the document's doctype slot.
: new-doctype-with-quirks ( document -- )
    [ <doctype> t >>quirks? ] dip doctype<< ;
332 TUPLE: tag self-closing? name attributes children end-tag ;
337 V{ } clone >>attributes
338 V{ } clone >>children ;
340 TUPLE: end-tag self-closing? name attributes ;
342 : <end-tag> ( -- tag )
345 V{ } clone >>attributes ;
347 : new-tag ( document -- )
! Make an end-tag built by <end-tag> the document's current tag.
: new-end-tag ( document -- )
    [ <end-tag> ] dip tag<< ;
! Mark the document's current tag as self-closing.
: set-self-closing ( document -- )
    tag>> t swap self-closing?<< ;
356 : <document> ( -- document )
359 initial-mode >>insertion-mode
362 ! SBUF" " clone >>tag-name
363 SBUF" " clone >>attribute-name
364 SBUF" " clone >>attribute-value
365 SBUF" " clone >>temporary-buffer
366 SBUF" " clone >>comment-token
367 V{ } clone >>open-elements
370 TUPLE: comment open payload close ;
372 : <comment> ( payload -- comment )
374 swap >>payload ; inline
! Set the quirks? flag on the document's current doctype.
: force-quirks ( document -- )
    doctype>> t swap quirks?<< ;
! Give the current doctype an empty string buffer as its name, so
! characters can be pushed onto it.
: initialize-doctype-name ( document -- )
    doctype>> SBUF" " clone >>name drop ;
! Give the current doctype an empty string buffer as its public
! identifier.
: initialize-doctype-public-identifier ( document -- )
    doctype>> SBUF" " clone >>public-identifier drop ;
! Give the current doctype an empty string buffer as its system
! identifier.
: initialize-doctype-system-identifier ( document -- )
    doctype>> SBUF" " clone >>system-identifier drop ;
! Append ch to the current doctype's name buffer.
: push-doctype-name ( ch document -- )
    swap [ doctype>> name>> ] dip suffix! drop ;
! Append ch to the current doctype's public identifier buffer.
: push-doctype-public-identifier ( ch document -- )
    swap [ doctype>> public-identifier>> ] dip suffix! drop ;
! Append ch to the current doctype's system identifier buffer.
: push-doctype-system-identifier ( ch document -- )
    swap [ doctype>> system-identifier>> ] dip suffix! drop ;
397 ! XXX: not html5 spec, fix
398 ERROR: unmatched-closing-tag-error stack tag ;
! True only for a tag whose end-tag slot is still f; anything that is
! not a tag answers f.
: unclosed-tag? ( obj -- ? )
    dup tag? [ end-tag>> not ] [ drop f ] if ; inline
403 :: find-matching-tag ( name stack -- seq )
404 stack [ { [ unclosed-tag? ] [ name>> name = ] } 1&& ] find-last drop [
407 stack name unmatched-closing-tag-error
411 GENERIC: tree-insert* ( document obj insertion-mode -- document )
413 : limited-quirks-mode? ( doctype -- ? )
415 [ public-identifier>> "-//W3C//DTD XHTML 1.0 Frameset//" head? ]
416 [ public-identifier>> "-//W3C//DTD XHTML 1.0 Transitional//" head? ]
417 [ { [ system-identifier>> ] [ public-identifier>> "-//W3C//DTD HTML 4.01 Frameset//" head? ] } 1&& ]
418 [ { [ system-identifier>> ] [ public-identifier>> "-//W3C//DTD HTML 4.01 Transitional//" head? ] } 1&& ]
421 ! https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode
422 M: initial-mode tree-insert*
424 { [ dup "\t\n\f\r\s" member? ] [ drop ] }
426 >>tree-doctype before-html-mode >>insertion-mode
428 { [ dup comment? ] [ over tree>> push ] }
429 { [ dup tag? ] [ over tree>> push ] }
431 dup name>> pick tree>> find-matching-tag
438 over iframe-srcdoc?>> [
439 over parser-cannot-change-mode-flag>> [
440 [ t >>quirks-mode? ] dip
443 "must be iframe-srcdoc here" throw
445 ! reprocess the token
446 before-html-mode >>insertion-mode tree-insert
450 ! https://html.spec.whatwg.org/multipage/parsing.html#the-before-html-insertion-mode
451 M: before-html-mode tree-insert*
453 { [ dup doctype? ] [ drop ] }
454 { [ dup comment? ] [ over tree>> push ] }
455 { [ dup "\t\n\f\r\s" member? ] [ drop ] }
456 { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [
458 before-head-mode >>insertion-mode
460 ! these tags are handled in the default case
461 { [ dup { [ end-tag? ] [ name>> { "head" "body" "html" "br" } member? not ] } 1&& ] [
462 ! error end-tag, ignore
466 ! Create missing html tag and reprocess the token
467 <tag> "html" >>name pick tree>> push
468 before-head-mode >>insertion-mode tree-insert
472 M: before-head-mode tree-insert*
474 { [ dup "\t\n\f\r\s" member? ] [ drop ] }
475 { [ dup comment? ] [ over tree>> push ] }
476 { [ dup doctype? ] [ drop ] }
477 { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [
478 ! XXX: in-body-mode rules here for html tag
481 ! before-head-mode >>insertion-mode
482 "handle html in-body-mode here" throw
484 { [ dup { [ tag? ] [ name>> "head" = ] } 1&& ] [
486 [ >>head-element-pointer drop ]
487 [ drop in-head-mode >>insertion-mode ] 2tri
489 ! these tags are handled in the default case
490 { [ dup { [ end-tag? ] [ name>> { "head" "body" "html" "br" } member? not ] } 1&& ] [
491 ! error end-tag, ignore
495 { [ dup tag? ] [ drop ] }
497 ! Create missing head tag and reprocess the token
499 [ "head" >>name pick tree>> push ]
500 [ >>head-element-pointer ] bi
501 in-head-mode >>insertion-mode tree-insert
505 M: in-head-mode tree-insert*
507 { [ dup "\t\n\f\r\s" member? ] [ over tree>> push ] }
508 { [ dup comment? ] [ over tree>> push ] }
509 { [ dup doctype? ] [ drop ] }
510 { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [
511 ! XXX: in-body-mode rules here for html tag
514 ! before-head-mode >>insertion-mode
515 "handle html in-body-mode here" throw
518 { [ dup { [ tag? ] [ name>> { "base" "basefont" "bgsound" "link" } member? ] } 1&& ] [
519 ! non-void-html-element-start-tag-with-trailing-solidus soft error if not self-closing
522 { [ dup { [ tag? ] [ name>> "meta" = ] } 1&& ] [
525 { [ dup { [ tag? ] [ name>> "title" = ] } 1&& ] [
526 ! https://html.spec.whatwg.org/multipage/parsing.html#generic-rcdata-element-parsing-algorithm
527 "insert title node" throw
532 [ { [ tag? ] [ name>> "noscript" = ] [ scripting?>> ] } 1&& ]
533 [ { [ tag? ] [ name>> { "noframes" "style" } member? ] } 1&& ]
536 ! https://html.spec.whatwg.org/multipage/parsing.html#generic-raw-text-element-parsing-algorithm
539 { [ dup { [ tag? ] [ name>> "noscript" = ] [ scripting?>> not ] } 1&& ] [
542 in-head-noscript-mode >>insertion-mode
544 { [ dup { [ tag? ] [ name>> "script" = ] } 1&& ] [
546 text-mode >>insertion-mode
548 { [ dup { [ end-tag? ] [ name>> "head" = ] } 1&& ] [
549 over tree>> last end-tag<<
550 after-head-mode >>insertion-mode
552 { [ dup { [ end-tag? ] [ name>> { "body" "html" "br" } member? ] } 1&& ] [
553 ! non-void-html-element-start-tag-with-trailing-solidus soft error if not self-closing
556 { [ dup { [ tag? ] [ name>> "template" = ] } 1&& ] [
558 in-template-mode >>insertion-mode
560 { [ dup { [ end-tag? ] [ name>> "template" = ] } 1&& ] [
565 [ { [ tag? ] [ name>> "head" = ] } 1&& ]
567 } 1|| ] [ drop "ignore here" throw ] }
569 ! end head tag should be here, pop off, reprocess
570 over tree>> pop swap >>end-tag
571 after-head-mode >>insertion-mode "omg" throw
575 M: in-head-noscript-mode tree-insert* drop unimplemented* ;
577 M: after-head-mode tree-insert*
579 { [ dup "\t\n\f\r\s" member? ] [ over tree>> push ] }
580 { [ dup comment? ] [ over tree>> push ] }
581 { [ dup doctype? ] [ drop ] }
582 { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [
583 ! XXX: in-body-mode rules here for html tag
586 ! before-head-mode >>insertion-mode
587 "handle html in-body-mode here" throw
590 { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [
593 { [ dup { [ tag? ] [ name>> "body" = ] } 1&& ] [
596 in-body-mode >>insertion-mode
598 { [ dup { [ tag? ] [ name>> "frameset" = ] } 1&& ] [
603 "base" "basefont" "bgsound" "link" "meta"
604 "noframes" "script" "style" "template" "title"
609 { [ dup { [ end-tag? ] [ name>> "template" = ] } 1&& ] [
612 ! same as default case
613 ! { [ dup { [ end-tag? ] [ name>> { "body" "html" "br" } member? not ] } 1&& ] [
618 [ { [ tag? ] [ name>> "head" = ] } 1&& ]
619 [ { [ end-tag? ] [ name>> { "body" "html" "br" } member? not ] } 1&& ]
622 "omg revisit this" throw
627 <tag> "body" >>name pick tree>> push
628 in-body-mode >>insertion-mode tree-insert
632 M: in-body-mode tree-insert*
634 { [ dup CHAR: \0 = ] [ drop ] }
635 { [ dup "\t\n\f\r\s" member? ] [ over tree>> push ] }
636 { [ dup comment? ] [ over tree>> push ] }
637 { [ dup doctype? ] [ drop ] }
638 { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [ drop ] }
645 "base" "basefont" "bgsound" "link" "meta"
646 "noframes" "script" "style" "template" "title"
651 { [ end-tag? ] [ name>> "template" = ] } 1&&
657 { [ dup { [ tag? ] [ name>> "body" = ] } 1&& ] [ drop unimplemented* ] }
658 { [ dup { [ tag? ] [ name>> "frameset" = ] } 1&& ] [ drop unimplemented* ] }
661 { [ dup { [ end-tag? ] [ name>> "body" = ] } 1&& ] [
662 "body" pick tree>> find-matching-tag
668 after-body-mode >>insertion-mode
670 { [ dup { [ end-tag? ] [ name>> "html" = ] } 1&& ] [ drop unimplemented* ] }
677 M: text-mode tree-insert* drop unimplemented* ;
678 M: in-table-mode tree-insert* drop unimplemented* ;
679 M: in-table-text-mode tree-insert* drop unimplemented* ;
680 M: in-caption-mode tree-insert* drop unimplemented* ;
681 M: in-column-group-mode tree-insert* drop unimplemented* ;
682 M: in-table-body-mode tree-insert* drop unimplemented* ;
683 M: in-row-mode tree-insert* drop unimplemented* ;
684 M: in-cell-mode tree-insert* drop unimplemented* ;
685 M: in-select-mode tree-insert* drop unimplemented* ;
686 M: in-select-in-table-mode tree-insert* drop unimplemented* ;
687 M: in-template-mode tree-insert* drop unimplemented* ;
688 M: after-body-mode tree-insert*
690 { [ dup "\t\n\f\r\s" member? ] [ over tree>> push ] }
691 { [ dup comment? ] [ over tree>> push ] }
692 { [ dup doctype? ] [ drop ] }
693 { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [
696 { [ dup { [ end-tag? ] [ name>> "html" = ] } 1&& ] [
697 ! XXX: make this a function
698 "html" pick tree>> find-matching-tag
704 after-after-body-mode >>insertion-mode
710 M: in-frameset-mode tree-insert* drop unimplemented* ;
711 M: after-frameset-mode tree-insert* drop unimplemented* ;
713 M: after-after-body-mode tree-insert*
715 { [ dup comment? ] [ over tree>> push ] }
716 { [ dup doctype? ] [ unimplemented* ] }
717 { [ dup "\t\n\f\r\s" member? ] [ unimplemented* ] }
718 { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [ unimplemented* ] }
720 { [ dup f = ] [ drop ] }
723 [ in-body-mode >>insertion-mode ] dip tree-insert
727 M: after-after-frameset-mode tree-insert* drop unimplemented* ;
! Dispatch obj to the tree-insert* method selected by the document's
! current insertion mode.
: tree-insert ( document obj -- document )
    [ dup insertion-mode>> ] dip swap tree-insert* ;
! Read vocab:html5/entities.json as UTF-8 and parse it as JSON.
! Memoized, so the file is read and parsed only on the first call.
MEMO: load-entities ( -- assoc )
    "vocab:html5/entities.json" utf8 file-contents json> ;
! Suffix array over the keys of the entity table, used for prefix
! queries on entity names. Memoized, so it is built only once.
! The result is what >suffix-array produces, not an assoc, so the
! stack effect names it accordingly.
MEMO: entities-suffix-array ( -- suffix-array )
    load-entities keys >suffix-array ;
738 : lookup-entity ( string -- entity/string ? )
741 : named-character-match? ( document -- prefix? exact? )
743 [ entities-suffix-array query f like ]
744 [ last CHAR: ; = ] bi ;
746 ERROR: unknown-named-entity entity ;
747 : take-named-character ( document -- )
749 temporary-buffer>> >string lookup-entity [
751 SBUF" " clone-like >>temporary-buffer drop
756 ! XXX: remove the tag>> name>> push part
757 : push-tag-name ( ch document -- )
758 [ tag>> name>> push ]
760 2drop ! tag-name>> push
! Append ch to the pending attribute name buffer.
: push-attribute-name ( ch document -- )
    swap [ attribute-name>> ] dip suffix! drop ;

! Append ch to the pending attribute value buffer.
: push-attribute-value ( ch document -- )
    swap [ attribute-value>> ] dip suffix! drop ;

! Append ch to the pending comment token buffer.
: push-comment-token ( ch document -- )
    swap [ comment-token>> ] dip suffix! drop ;

! Append every element of string to the pending comment token buffer.
: push-all-comment-token ( string document -- )
    swap [ comment-token>> ] dip append! drop ;
ERROR: invalid-return-state obj ;

! Pass obj through unchanged when it is a word; otherwise throw
! invalid-return-state carrying obj.
: check-return-state ( obj -- return-state )
    dup word? not [ invalid-return-state ] when ;
! Snapshot the pending attribute as a { name value } pair of strings.
! An empty name or value is represented as f, and the result is f
! when both are empty.
: current-attribute ( document -- attribute/f )
    [ attribute-name>> ] [ attribute-value>> ] bi
    [ >string f like ] bi@
    2dup or [ 2array ] [ 2drop f ] if ;
! Push obj onto seq unless obj is f, in which case nothing happens.
: push-when ( obj/f seq -- )
    swap [ swap push ] [ drop ] if* ; inline
! Replace the pending attribute name and value with fresh, empty
! string buffers.
: reset-attribute ( document -- )
    [ SBUF" " clone swap attribute-name<< ]
    [ SBUF" " clone swap attribute-value<< ] bi ;
! Move the pending attribute (if any) onto the current tag's
! attribute list, then clear the pending name/value buffers.
: push-attribute ( document -- )
    dup current-attribute over tag>> attributes>> push-when
    reset-attribute ;
788 : emit-eof ( document -- )
! Debug emitter: print "emit-char: " followed by the character as a
! one-element string. The document argument is not used.
: emit-char ( char document -- )
    "emit-char: " write drop 1string . ;
! Debug emitter: print "emit-string: " followed by the value
! prettyprinted. The document argument is not used.
! The first input is a string (the visible caller passes "<!"), so the
! stack effect names it string rather than char.
: emit-string ( string document -- )
    drop "emit-string: " write . ;
793 : emit-tag ( document -- )
796 [ tag>> [ name>> >string ] [ name<< ] bi ]
799 [ dup tag>> tree-insert drop ]
802 : emit-end-tag ( document -- )
803 "emit-end-tag: " write
805 [ f >>tag drop ] bi ;
806 : emit-comment-token ( document -- )
807 "emit-comment-token: " write
809 [ comment-token>> >string . ]
810 [ dup comment-token>> >string <comment> tree-insert drop ]
811 [ SBUF" " clone >>comment-token drop ]
813 : emit-doctype ( document -- )
814 "emit-doctype: " write dup doctype>> .
816 [ doctype>> [ >string ] change-name drop ]
818 ! XXX: handle iframe srcdoc document
819 dup { [ doctype>> name>> "html" = not ] [ parser-cannot-change-mode-flag>> not ] } 1&& [
822 dup { [ iframe-srcdoc?>> not ] [ parser-cannot-change-mode-flag>> not ] } 1&& [
823 dup doctype>> limited-quirks-mode? [ t >>limited-quirks-mode? ] when
828 [ dup doctype>> tree-insert drop ]
! Replace the document's temporary buffer with a fresh, empty one.
! Slot writers take ( value object -- ), so the new buffer has to sit
! below the document; the swap puts the operands in that order.
: reset-temporary-buffer ( document -- )
    SBUF" " clone swap temporary-buffer<< ;
! Replace the temporary buffer with a new one holding only ch.
: ch>new-temporary-buffer ( ch document -- )
    swap 1sbuf >>temporary-buffer drop ;

! Replace the temporary buffer with a new one holding the contents
! of string.
: string>new-temporary-buffer ( string document -- )
    swap SBUF" " clone-like >>temporary-buffer drop ;

! Last character in the temporary buffer, or f when it is empty.
: temporary-buffer-last ( document -- ch/f )
    temporary-buffer>> dup empty? [ drop f ] [ last ] if ;

! Append ch to the temporary buffer.
: push-temporary-buffer ( ch document -- )
    swap [ temporary-buffer>> ] dip suffix! drop ;

! Append every element of string to the temporary buffer.
: push-all-temporary-buffer ( string document -- )
    swap [ temporary-buffer>> ] dip append! drop ;
! Print the "flush-temporary-buffer: " debug marker, emit every
! buffered character through emit-char in order, then install a fresh
! empty temporary buffer.
: flush-temporary-buffer ( document -- )
    "flush-temporary-buffer: " write
    dup temporary-buffer>> over [ emit-char ] curry each
    SBUF" " clone swap temporary-buffer<< ;
! Emit the characters of string followed by the buffered characters,
! then reset the temporary buffer.
! The prefix goes in FRONT of the buffer contents: callers pass "</"
! for an end tag that turned out not to be appropriate, and those two
! characters must come out before the tag-name characters collected so
! far. Appending them with push-all would emit them last.
: emit-temporary-buffer-with ( string document -- )
    [ [ temporary-buffer>> append SBUF" " clone-like ] keep temporary-buffer<< ]
    [ flush-temporary-buffer ] bi ;
848 ! check if matches open tag
849 : appropriate-end-tag-token? ( document -- ? )
! ASCII character-class predicates.
! Each one accepts f as well as a character and answers f for it, so
! all stack effects are spelled ch/f consistently.
: ascii-upper-alpha? ( ch/f -- ? ) dup [ CHAR: A CHAR: Z between? ] when ; inline
: ascii-lower-alpha? ( ch/f -- ? ) dup [ CHAR: a CHAR: z between? ] when ; inline
: ascii-upper-hex-digit? ( ch/f -- ? ) dup [ CHAR: A CHAR: F between? ] when ; inline
: ascii-lower-hex-digit? ( ch/f -- ? ) dup [ CHAR: a CHAR: f between? ] when ; inline
! A-F or a-f.
: ascii-hex-alpha? ( ch/f -- ? ) { [ ascii-upper-hex-digit? ] [ ascii-lower-hex-digit? ] } 1|| ; inline

: ascii-digit? ( ch/f -- ? ) dup [ CHAR: 0 CHAR: 9 between? ] when ;
: ascii-alpha? ( ch/f -- ? ) { [ ascii-lower-alpha? ] [ ascii-upper-alpha? ] } 1|| ;
: ascii-alphanumeric? ( ch/f -- ? ) { [ ascii-alpha? ] [ ascii-digit? ] } 1|| ;
: ascii-hex-digit? ( ch/f -- ? ) { [ ascii-digit? ] [ ascii-hex-alpha? ] } 1|| ;
! Resume the saved return state with the current character still on
! the stack: read the document's return-state slot, clear it to f,
! check that the saved value is a word, and execute it.
: (return-state) ( document n/f string ch/f -- document n'/f string )
    reach [ return-state>> ] [ f >>return-state drop ] bi
    check-return-state
    execute( document n/f string ch/f -- document n'/f string ) ;
! Resume the saved return state: read the document's return-state
! slot, clear it to f, check that the saved value is a word, and
! execute it.
: return-state ( document n/f string -- document n'/f string )
    pick [ return-state>> ] [ f >>return-state drop ] bi
    check-return-state
    execute( document n/f string -- document n'/f string ) ;
871 : (data-state) ( document n/f string ch/f -- document n'/f string )
873 { [ dup CHAR: & = ] [ drop [ \ data-state >>return-state ] 2dip character-reference-state ] }
874 { [ dup CHAR: < = ] [ drop tag-open-state ] }
875 { [ dup CHAR: \0 = ] [ unexpected-null-character ] }
876 { [ dup f = ] [ drop pick emit-eof ] }
877 [ reach emit-char data-state ]
! Data state entry point: fetch the next character (or f) with
! take-char and hand it to (data-state).
: data-state ( document n/f string -- document n'/f string )
    take-char (data-state) ;
884 : (rcdata-state) ( document n/f string ch/f -- document n'/f string )
886 { [ dup CHAR: & = ] [ drop [ \ rcdata-state >>return-state ] 2dip character-reference-state ] }
887 { [ dup CHAR: < = ] [ drop rcdata-less-than-sign-state ] }
888 { [ dup CHAR: \0 = ] [ unexpected-null-character ] }
889 { [ dup f = ] [ drop pick emit-eof ] }
890 [ reach emit-char rcdata-state ]
! RCDATA state entry point: fetch the next character (or f) with
! take-char and hand it to (rcdata-state).
: rcdata-state ( document n/f string -- document n'/f string )
    take-char (rcdata-state) ;
897 : (rawtext-state) ( document n/f string ch/f -- document n'/f string )
899 { [ dup CHAR: < = ] [ drop rawtext-less-than-sign-state ] }
900 { [ dup CHAR: \0 = ] [ drop unexpected-null-character ] }
901 { [ dup f = ] [ drop pick emit-eof ] }
902 [ reach emit-char rawtext-state ]
! RAWTEXT state entry point: fetch the next character (or f) with
! take-char and hand it to (rawtext-state).
: rawtext-state ( document n/f string -- document n'/f string )
    take-char (rawtext-state) ;
909 : (script-data-state) ( document n/f string ch/f -- document n'/f string )
911 { [ dup CHAR: < = ] [ drop script-data-less-than-sign-state ] }
912 { [ dup CHAR: \0 = ] [ drop unexpected-null-character ] }
913 { [ dup f = ] [ drop pick emit-eof ] }
914 [ reach emit-char script-data-state ]
! Script data state entry point: fetch the next character (or f) with
! take-char and hand it to (script-data-state).
: script-data-state ( document n/f string -- document n'/f string )
    take-char (script-data-state) ;
921 : (plaintext-state) ( document n/f string ch/f -- document n'/f string )
923 { [ dup CHAR: \0 = ] [ drop unexpected-null-character ] }
924 { [ dup f = ] [ drop pick emit-eof ] }
925 [ reach emit-char plaintext-state ]
! PLAINTEXT state entry point: fetch the next character (or f) with
! take-char and hand it to (plaintext-state).
: plaintext-state ( document n/f string -- document n'/f string )
    take-char (plaintext-state) ;
932 : (tag-open-state) ( document n/f string ch/f -- document n'/f string )
934 { [ dup ascii-alpha? ] [ reach new-tag (tag-name-state) ] }
935 { [ dup CHAR: ! = ] [ drop markup-declaration-open-state ] }
936 { [ dup CHAR: / = ] [ drop end-tag-open-state ] }
937 { [ dup CHAR: ? = ] [ unexpected-question-mark-instead-of-tag-name ] }
938 { [ dup f = ] [ eof-before-tag-name ] }
939 [ invalid-first-character-of-tag-name ]
! Tag open state entry point: fetch the next character (or f) with
! take-char and hand it to (tag-open-state).
: tag-open-state ( document n/f string -- document n'/f string )
    take-char (tag-open-state) ;
946 : (end-tag-open-state) ( document n/f string ch/f -- document n'/f string )
948 { [ dup ascii-alpha? ] [ reach new-end-tag (tag-name-state) ] }
949 { [ dup CHAR: > = ] [ missing-end-tag-name ] }
950 { [ dup f = ] [ eof-before-tag-name ] }
951 [ invalid-first-character-of-tag-name ]
! End tag open state entry point: fetch the next character (or f)
! with take-char and hand it to (end-tag-open-state).
: end-tag-open-state ( document n/f string -- document n'/f string )
    take-char (end-tag-open-state) ;
958 : (tag-name-state) ( document n/f string ch/f -- document n'/f string )
960 { [ dup ascii-upper-alpha? ] [ 0x20 + reach push-tag-name tag-name-state ] }
961 { [ dup "\t\n\f\s" member? ] [ drop before-attribute-name-state ] }
962 { [ dup CHAR: / = ] [ drop self-closing-start-tag-state ] }
963 { [ dup CHAR: > = ] [ drop pick emit-tag data-state ] }
964 { [ dup CHAR: \0 = ] [ unexpected-null-character ] }
965 { [ dup f = ] [ eof-before-tag-name ] }
966 [ reach push-tag-name tag-name-state ]
! Tag name state entry point: fetch the next character (or f) with
! take-char and hand it to (tag-name-state).
: tag-name-state ( document n/f string -- document n'/f string )
    take-char (tag-name-state) ;
973 : (rcdata-less-than-sign-state) ( document n/f string ch/f -- document n'/f string )
975 { [ dup CHAR: / = ] [ drop pick reset-temporary-buffer rcdata-end-tag-open-state ] }
976 [ [ CHAR: < reach emit-char ] dip (rcdata-state) ]
! RCDATA less-than sign state entry point: fetch the next character
! (or f) with take-char and hand it to (rcdata-less-than-sign-state).
: rcdata-less-than-sign-state ( document n/f string -- document n'/f string )
    take-char (rcdata-less-than-sign-state) ;
983 : (rcdata-end-tag-open-state) ( document n/f string ch/f -- document n'/f string )
985 { [ dup ascii-alpha? ] [ reach new-end-tag (rcdata-end-tag-name-state) ] }
986 [ [ CHAR: < reach emit-char ] dip (rcdata-state) ]
! RCDATA end tag open state entry point: fetch the next character
! (or f) with take-char and hand it to (rcdata-end-tag-open-state).
: rcdata-end-tag-open-state ( document n/f string -- document n'/f string )
    take-char (rcdata-end-tag-open-state) ;
993 : (rcdata-end-tag-name-state) ( document n/f string ch/f -- document n'/f string )
995 { [ dup "\t\n\f\s" member? ] [
996 drop pick appropriate-end-tag-token?
997 [ before-attribute-name-state ] [ "</" reach emit-temporary-buffer-with rcdata-state ] if
999 { [ dup CHAR: / = ] [
1000 drop pick appropriate-end-tag-token?
1001 [ self-closing-start-tag-state ] [ "</" reach emit-temporary-buffer-with rcdata-state ] if
1003 { [ dup CHAR: > = ] [
1004 drop pick appropriate-end-tag-token?
1005 [ pick emit-end-tag data-state ] [ "</" reach emit-temporary-buffer-with rcdata-state ] if
1007 { [ dup ascii-upper-alpha? ] [ [ 0x20 + reach push-tag-name ] [ reach push-temporary-buffer ] bi rcdata-end-tag-name-state ] }
1008 { [ dup ascii-lower-alpha? ] [ [ reach push-tag-name ] [ reach push-temporary-buffer ] bi rcdata-end-tag-name-state ] }
1009 [ [ "</" reach emit-temporary-buffer-with ] dip (rcdata-state) ]
! RCDATA end tag name state entry point: fetch the next character
! (or f) with take-char and hand it to (rcdata-end-tag-name-state).
: rcdata-end-tag-name-state ( document n/f string -- document n'/f string )
    take-char (rcdata-end-tag-name-state) ;
! RAWTEXT less-than-sign state (a "<" was seen while in RAWTEXT).
!   "/"  -> reset the document's temporary buffer, continue in
!           rawtext-end-tag-open-state.
!   else -> emit "<" to the document, then re-dispatch the same character
!           through (rawtext-state).
1016 : (rawtext-less-than-sign-state) ( document n/f string ch/f -- document n'/f string )
1018 { [ dup CHAR: / = ] [ drop pick reset-temporary-buffer rawtext-end-tag-open-state ] }
1019 [ [ CHAR: < reach emit-char ] dip (rawtext-state) ]
! Entry point: take the next character and dispatch on it.
1022 : rawtext-less-than-sign-state ( document n/f string -- document n'/f string )
1023 take-char (rawtext-less-than-sign-state) ;
! RAWTEXT end-tag-open state ("</" was seen while in RAWTEXT).
!   ASCII letter -> start a new end tag on the document and re-dispatch the
!                   letter through (rawtext-end-tag-name-state).
!   else         -> the "</" was plain text: emit both characters, then
!                   re-dispatch the current character through (rawtext-state).
! Fix: the fallback used to emit only "<", silently losing the "/".
1026 : (rawtext-end-tag-open-state) ( document n/f string ch/f -- document n'/f string )
1028 { [ dup ascii-alpha? ] [ reach new-end-tag (rawtext-end-tag-name-state) ] }
1029 [ [ "</" reach emit-string ] dip (rawtext-state) ]
! Entry point: take the next character and dispatch on it.
1032 : rawtext-end-tag-open-state ( document n/f string -- document n'/f string )
1033 take-char (rawtext-end-tag-open-state) ;
! RAWTEXT end-tag-name state.
!   whitespace, "/" or ">" -> if the document reports an appropriate end tag
!       token, drop the character and go to before-attribute-name /
!       self-closing-start-tag / (emit end tag, data-state) respectively;
!       otherwise fall back exactly like the default branch.
!   ASCII letter -> push the lowercased letter onto the tag name and the
!       original letter onto the temporary buffer, stay in this state.
!   else -> emit "</" plus the temporary buffer, then re-dispatch the current
!       character through (rawtext-state).
! Fix: the "not appropriate" fallbacks used to drop the current character
! before switching to rawtext-state, losing it from the output.  It is now
! kept on the stack and reconsumed by (rawtext-state).
1036 : (rawtext-end-tag-name-state) ( document n/f string ch/f -- document n'/f string )
1038 { [ dup "\t\n\f\s" member? ] [
1039 reach appropriate-end-tag-token?
1040 [ drop before-attribute-name-state ] [ [ "</" reach emit-temporary-buffer-with ] dip (rawtext-state) ] if
1042 { [ dup CHAR: / = ] [
1043 reach appropriate-end-tag-token?
1044 [ drop self-closing-start-tag-state ] [ [ "</" reach emit-temporary-buffer-with ] dip (rawtext-state) ] if
1046 { [ dup CHAR: > = ] [
1047 reach appropriate-end-tag-token?
1048 [ drop pick emit-end-tag data-state ] [ [ "</" reach emit-temporary-buffer-with ] dip (rawtext-state) ] if
1050 { [ dup ascii-upper-alpha? ] [ [ 0x20 + reach push-tag-name ] [ reach push-temporary-buffer ] bi rawtext-end-tag-name-state ] }
1051 { [ dup ascii-lower-alpha? ] [ [ reach push-tag-name ] [ reach push-temporary-buffer ] bi rawtext-end-tag-name-state ] }
1052 [ [ "</" reach emit-temporary-buffer-with ] dip (rawtext-state) ]
! Entry point: take the next character and dispatch on it.
1055 : rawtext-end-tag-name-state ( document n/f string -- document n'/f string )
1056 take-char (rawtext-end-tag-name-state) ;
! Script-data less-than-sign state (a "<" was seen in script data).
!   "/"  -> reset the document's temporary buffer, continue in
!           script-data-end-tag-open-state.
!   "!"  -> emit "<!" and continue in script-data-escape-start-state.
!   else -> emit "<", then re-dispatch the same character through
!           (script-data-state).
1059 : (script-data-less-than-sign-state) ( document n/f string ch/f -- document n'/f string )
1061 { [ dup CHAR: / = ] [ drop pick reset-temporary-buffer script-data-end-tag-open-state ] }
1062 { [ dup CHAR: ! = ] [ drop "<!" reach emit-string script-data-escape-start-state ] }
1063 [ [ CHAR: < reach emit-char ] dip (script-data-state) ]
! Entry point: take the next character and dispatch on it.
1066 : script-data-less-than-sign-state ( document n/f string -- document n'/f string )
1067 take-char (script-data-less-than-sign-state) ;
! Script-data end-tag-open state ("</" was seen in script data).
!   ASCII letter -> start a new end tag on the document and re-dispatch the
!                   letter through (script-data-end-tag-name-state).
!   else         -> emit "</", then re-dispatch the current character through
!                   (script-data-state).
1070 : (script-data-end-tag-open-state) ( document n/f string ch/f -- document n'/f string )
1072 { [ dup ascii-alpha? ] [ reach new-end-tag (script-data-end-tag-name-state) ] }
1073 [ [ "</" reach emit-string ] dip (script-data-state) ]
! Entry point: take the next character and dispatch on it.
1076 : script-data-end-tag-open-state ( document n/f string -- document n'/f string )
1077 take-char (script-data-end-tag-open-state) ;
! Script-data end-tag-name state.
!   whitespace, "/" or ">" -> if the document reports an appropriate end tag
!       token, drop the character and go to before-attribute-name /
!       self-closing-start-tag / (emit end tag, data-state) respectively;
!       otherwise fall back exactly like the default branch.
!   ASCII letter -> push the lowercased letter onto the tag name and the
!       original letter onto the temporary buffer, stay in this state.
!   else -> emit "</" plus the temporary buffer, then re-dispatch the current
!       character through (script-data-state).
! Fixes:
!  * The letter branches used to continue in rawtext-end-tag-name-state
!    (copy/paste slip), so a failed match fell back into RAWTEXT instead of
!    script data.  They now stay in script-data-end-tag-name-state.
!  * The "not appropriate" fallbacks used to drop the current character
!    before switching to script-data-state; it is now reconsumed.
1080 : (script-data-end-tag-name-state) ( document n/f string ch/f -- document n'/f string )
1082 { [ dup "\t\n\f\s" member? ] [
1083 reach appropriate-end-tag-token?
1084 [ drop before-attribute-name-state ] [ [ "</" reach emit-temporary-buffer-with ] dip (script-data-state) ] if
1086 { [ dup CHAR: / = ] [
1087 reach appropriate-end-tag-token?
1088 [ drop self-closing-start-tag-state ] [ [ "</" reach emit-temporary-buffer-with ] dip (script-data-state) ] if
1090 { [ dup CHAR: > = ] [
1091 reach appropriate-end-tag-token?
1092 [ drop pick emit-end-tag data-state ] [ [ "</" reach emit-temporary-buffer-with ] dip (script-data-state) ] if
1094 { [ dup ascii-upper-alpha? ] [ [ 0x20 + reach push-tag-name ] [ reach push-temporary-buffer ] bi script-data-end-tag-name-state ] }
1095 { [ dup ascii-lower-alpha? ] [ [ reach push-tag-name ] [ reach push-temporary-buffer ] bi script-data-end-tag-name-state ] }
1096 [ [ "</" reach emit-temporary-buffer-with ] dip (script-data-state) ]
! Entry point: take the next character and dispatch on it.
1099 : script-data-end-tag-name-state ( document n/f string -- document n'/f string )
1100 take-char (script-data-end-tag-name-state) ;
! Script-data escape-start state ("<!" was seen in script data).
!   "-"  -> emit the hyphen and continue in
!           script-data-escape-start-dash-state.
!   else -> re-dispatch the character through (script-data-state).
! Fix: the hyphen used to be dropped without being emitted, so the "-" of
! "<!--" never reached the output.  The escaped-dash-dash state already emits
! its hyphen with `reach emit-char`; the same is done here.
1103 : (script-data-escape-start-state) ( document n/f string ch/f -- document n'/f string )
1105 { [ dup CHAR: - = ] [ reach emit-char script-data-escape-start-dash-state ] }
1106 [ (script-data-state) ]
! Entry point: take the next character and dispatch on it.
1109 : script-data-escape-start-state ( document n/f string -- document n'/f string )
1110 take-char (script-data-escape-start-state) ;
! Script-data escape-start-dash state ("<!-" was seen in script data).
!   "-"  -> emit the hyphen and continue in
!           script-data-escaped-dash-dash-state.
!   else -> re-dispatch the character through (script-data-state).
! Fix: the second hyphen used to be dropped without being emitted.
1113 : (script-data-escape-start-dash-state) ( document n/f string ch/f -- document n'/f string )
1115 { [ dup CHAR: - = ] [ reach emit-char script-data-escaped-dash-dash-state ] }
1116 [ (script-data-state) ]
! Entry point: take the next character and dispatch on it.
1119 : script-data-escape-start-dash-state ( document n/f string -- document n'/f string )
1120 take-char (script-data-escape-start-dash-state) ;
! Script-data escaped state (inside "<!-- ... " within a script).
!   "-"  -> emit the hyphen, continue in script-data-escaped-dash-state.
!   "<"  -> continue in script-data-escaped-less-than-sign-state (the "<" is
!           emitted later by that state if it turns out to be text).
!   NUL  -> unexpected-null-character; replacement handling is still
!           unimplemented*.
!   f    -> eof-in-script-html-comment-like-text.
!   else -> emit the character and stay in this state.
! Fix: the hyphen used to be dropped instead of emitted, so every "-" inside
! escaped script text disappeared from the output.
! NOTE(review): the NUL and f branches leave the character on the stack; that
! is only balanced if those error words do not return -- confirm.
1123 : (script-data-escaped-state) ( document n/f string ch/f -- document n'/f string )
1125 { [ dup CHAR: - = ] [ reach emit-char script-data-escaped-dash-state ] }
1126 { [ dup CHAR: < = ] [ drop script-data-escaped-less-than-sign-state ] }
1127 { [ dup CHAR: \0 = ] [ unexpected-null-character CHAR: replacement-character unimplemented* ] }
1128 { [ dup f = ] [ eof-in-script-html-comment-like-text ] }
1129 [ reach emit-char script-data-escaped-state ]
! Entry point: take the next character and dispatch on it.
1132 : script-data-escaped-state ( document n/f string -- document n'/f string )
1133 take-char (script-data-escaped-state) ;
! Script-data escaped-dash state (one "-" seen in escaped script text).
!   "-"  -> emit the hyphen, continue in script-data-escaped-dash-dash-state.
!   "<"  -> continue in script-data-escaped-less-than-sign-state.
!   NUL  -> unexpected-null-character, then back to script-data-escaped-state.
!   f    -> eof-in-script-html-comment-like-text.
!   else -> emit the character, back to script-data-escaped-state.
! Fix: the hyphen used to be dropped instead of emitted.
! NOTE(review): the NUL branch keeps the character on the stack and emits no
! replacement character; only harmless if unexpected-null-character does not
! return -- confirm.
1136 : (script-data-escaped-dash-state) ( document n/f string ch/f -- document n'/f string )
1138 { [ dup CHAR: - = ] [ reach emit-char script-data-escaped-dash-dash-state ] }
1139 { [ dup CHAR: < = ] [ drop script-data-escaped-less-than-sign-state ] }
1140 { [ dup CHAR: \0 = ] [ unexpected-null-character script-data-escaped-state ] }
1141 { [ dup f = ] [ eof-in-script-html-comment-like-text ] }
1142 [ reach emit-char script-data-escaped-state ]
! Entry point: take the next character and dispatch on it.
1145 : script-data-escaped-dash-state ( document n/f string -- document n'/f string )
1146 take-char (script-data-escaped-dash-state) ;
! Script-data escaped-dash-dash state ("--" seen in escaped script text).
!   "-"  -> emit it and stay in this state.
!   "<"  -> continue in script-data-escaped-less-than-sign-state.
!   ">"  -> emit it and return to script-data-state (the escape is over).
!   NUL  -> unexpected-null-character, then script-data-escaped-state.
!   f    -> eof-in-script-html-comment-like-text.
!   else -> emit the character, back to script-data-escaped-state.
! NOTE(review): the NUL and f branches leave the character on the stack;
! balanced only if those error words do not return -- confirm.
1149 : (script-data-escaped-dash-dash-state) ( document n/f string ch/f -- document n'/f string )
1151 { [ dup CHAR: - = ] [ reach emit-char script-data-escaped-dash-dash-state ] }
1152 { [ dup CHAR: < = ] [ drop script-data-escaped-less-than-sign-state ] }
1153 { [ dup CHAR: > = ] [ reach emit-char script-data-state ] }
1154 { [ dup CHAR: \0 = ] [ unexpected-null-character script-data-escaped-state ] }
1155 { [ dup f = ] [ eof-in-script-html-comment-like-text ] }
1156 [ reach emit-char script-data-escaped-state ]
! Entry point: take the next character and dispatch on it.
1159 : script-data-escaped-dash-dash-state ( document n/f string -- document n'/f string )
1160 take-char (script-data-escaped-dash-dash-state) ;
! Script-data escaped less-than-sign state ("<" seen in escaped script text).
!   "/"          -> reset the temporary buffer, continue in
!                   script-data-escaped-end-tag-open-state.
!   ASCII letter -> reset the temporary buffer, emit "<", and re-dispatch the
!                   letter through (script-data-double-escape-start-state).
!   else         -> emit "<" and re-dispatch the character through
!                   (script-data-escaped-state).
1163 : (script-data-escaped-less-than-sign-state) ( document n/f string ch/f -- document n'/f string )
1165 { [ dup CHAR: / = ] [ drop pick reset-temporary-buffer script-data-escaped-end-tag-open-state ] }
1166 { [ dup ascii-alpha? ] [ [ pick reset-temporary-buffer CHAR: < reach emit-char ] dip (script-data-double-escape-start-state) ] }
1167 [ [ CHAR: < reach emit-char ] dip (script-data-escaped-state) ]
! Entry point: take the next character and dispatch on it.
1170 : script-data-escaped-less-than-sign-state ( document n/f string -- document n'/f string )
1171 take-char (script-data-escaped-less-than-sign-state) ;
! Script-data escaped end-tag-open state ("</" seen in escaped script text).
!   ASCII letter -> start a new end tag on the document (the letter is kept
!                   aside with dip) and re-dispatch the letter through
!                   (script-data-escaped-end-tag-name-state).
!   else         -> emit "</" and re-dispatch the character through
!                   (script-data-escaped-state).
1174 : (script-data-escaped-end-tag-open-state) ( document n/f string ch/f -- document n'/f string )
1176 { [ dup ascii-alpha? ] [ [ pick new-end-tag ] dip (script-data-escaped-end-tag-name-state) ] }
1177 [ [ "</" reach emit-string ] dip (script-data-escaped-state) ]
! Entry point: take the next character and dispatch on it.
1180 : script-data-escaped-end-tag-open-state ( document n/f string -- document n'/f string )
1181 take-char (script-data-escaped-end-tag-open-state) ;
! Script-data escaped end-tag-name state.
!   whitespace, "/" or ">" -> if the document reports an appropriate end tag
!       token, drop the character and go to before-attribute-name /
!       self-closing-start-tag / (emit end tag, data-state) respectively;
!       otherwise fall back exactly like the default branch.
!   ASCII letter -> push the lowercased letter onto the tag name and the
!       original letter onto the temporary buffer, stay in this state.
!   else -> emit "</" plus the temporary buffer, then re-dispatch the current
!       character through (script-data-escaped-state).
! Fix: the "not appropriate" fallbacks used to drop the current character
! before switching to script-data-escaped-state, losing it from the output.
! It is now kept on the stack and reconsumed.
1184 : (script-data-escaped-end-tag-name-state) ( document n/f string ch/f -- document n'/f string )
1186 { [ dup "\t\n\f\s" member? ] [
1187 reach appropriate-end-tag-token?
1188 [ drop before-attribute-name-state ] [ [ "</" reach emit-temporary-buffer-with ] dip (script-data-escaped-state) ] if
1190 { [ dup CHAR: / = ] [
1191 reach appropriate-end-tag-token?
1192 [ drop self-closing-start-tag-state ] [ [ "</" reach emit-temporary-buffer-with ] dip (script-data-escaped-state) ] if
1194 { [ dup CHAR: > = ] [
1195 reach appropriate-end-tag-token?
1196 [ drop pick emit-end-tag data-state ] [ [ "</" reach emit-temporary-buffer-with ] dip (script-data-escaped-state) ] if
1198 { [ dup ascii-upper-alpha? ] [ [ 0x20 + reach push-tag-name ] [ reach push-temporary-buffer ] bi script-data-escaped-end-tag-name-state ] }
1199 { [ dup ascii-lower-alpha? ] [ [ reach push-tag-name ] [ reach push-temporary-buffer ] bi script-data-escaped-end-tag-name-state ] }
1200 [ [ "</" reach emit-temporary-buffer-with ] dip (script-data-escaped-state) ]
! Entry point: take the next character and dispatch on it.
1203 : script-data-escaped-end-tag-name-state ( document n/f string -- document n'/f string )
1204 take-char (script-data-escaped-end-tag-name-state) ;
! Script-data double-escape-start state ("<" + letters seen in escaped
! script text; decides whether a nested "<script" begins).
!   whitespace, "/" or ">" -> compare the temporary buffer with "script":
!       equal -> script-data-double-escaped-state, else
!       script-data-escaped-state.
!   ASCII letter -> append the lowercased letter to the temporary buffer and
!       emit the letter as text, stay in this state.
!   else -> re-dispatch the character through (script-data-escaped-state).
! Fixes in the letter branches:
!  * No tag token is being built in this state, so the letter must not be
!    pushed onto the tag name (push-tag-name removed).
!  * The temporary buffer is compared with lowercase "script", so the
!    lowercased letter (0x20 +) is what gets buffered; previously "SCRIPT"
!    could never match.
!  * The letter is text and is now emitted with emit-char.
! NOTE(review): whether the delimiter in the first branch is emitted before
! the buffer comparison cannot be seen from these lines -- confirm.
1207 : (script-data-double-escape-start-state) ( document n/f string ch/f -- document n'/f string )
1209 { [ dup "\t\n\f\s/>" member? ] [
1211 pick temporary-buffer>> "script" sequence=
1212 [ script-data-double-escaped-state ] [ script-data-escaped-state ] if
1214 { [ dup ascii-upper-alpha? ] [ [ 0x20 + reach push-temporary-buffer ] [ reach emit-char ] bi script-data-double-escape-start-state ] }
1215 { [ dup ascii-lower-alpha? ] [ [ reach push-temporary-buffer ] [ reach emit-char ] bi script-data-double-escape-start-state ] }
1216 [ (script-data-escaped-state) ]
! Entry point: take the next character and dispatch on it.
1219 : script-data-double-escape-start-state ( document n/f string -- document n'/f string )
1220 take-char (script-data-double-escape-start-state) ;
! Script-data double-escaped state.
!   "-"  -> emit it, continue in script-data-double-escaped-dash-state.
!   "<"  -> emit it, continue in
!           script-data-double-escaped-less-than-sign-state.
!   NUL  -> unexpected-null-character, then emit a replacement character and
!           stay in this state.
!   f    -> eof-in-script-html-comment-like-text.
!   else -> emit the character and stay in this state.
! NOTE(review): in the NUL branch the original character is still on the
! stack when `reach` runs, so `reach` would not select the document unless
! unexpected-null-character consumes it or does not return -- confirm.
1223 : (script-data-double-escaped-state) ( document n/f string ch/f -- document n'/f string )
1225 { [ dup CHAR: - = ] [ reach emit-char script-data-double-escaped-dash-state ] }
1226 { [ dup CHAR: < = ] [ reach emit-char script-data-double-escaped-less-than-sign-state ] }
1227 { [ dup CHAR: \0 = ] [
1228 unexpected-null-character
1229 CHAR: replacement-character reach emit-char
1230 script-data-double-escaped-state
1232 { [ dup f = ] [ eof-in-script-html-comment-like-text ] }
1233 [ reach emit-char script-data-double-escaped-state ]
! Entry point: take the next character and dispatch on it.
1236 : script-data-double-escaped-state ( document n/f string -- document n'/f string )
1237 take-char (script-data-double-escaped-state) ;
! Script-data double-escaped-dash state (one "-" seen).
!   "-"  -> emit it, continue in script-data-double-escaped-dash-dash-state.
!   "<"  -> emit it, continue in
!           script-data-double-escaped-less-than-sign-state.
!   NUL  -> unexpected-null-character, emit a replacement character, back to
!           script-data-double-escaped-state.
!   f    -> eof-in-script-html-comment-like-text.
!   else -> emit the character, back to script-data-double-escaped-state.
! NOTE(review): the NUL branch runs `reach` with the original character still
! on the stack; see unexpected-null-character's stack effect -- confirm.
1240 : (script-data-double-escaped-dash-state) ( document n/f string ch/f -- document n'/f string )
1242 { [ dup CHAR: - = ] [ reach emit-char script-data-double-escaped-dash-dash-state ] }
1243 { [ dup CHAR: < = ] [ reach emit-char script-data-double-escaped-less-than-sign-state ] }
1244 { [ dup CHAR: \0 = ] [
1245 unexpected-null-character
1246 CHAR: replacement-character reach emit-char
1247 script-data-double-escaped-state
1249 { [ dup f = ] [ eof-in-script-html-comment-like-text ] }
1250 [ reach emit-char script-data-double-escaped-state ]
! Entry point: take the next character and dispatch on it.
1253 : script-data-double-escaped-dash-state ( document n/f string -- document n'/f string )
1254 take-char (script-data-double-escaped-dash-state) ;
! Script-data double-escaped-dash-dash state ("--" seen).
!   "-"  -> emit it and stay in this state.
!   "<"  -> emit it, continue in
!           script-data-double-escaped-less-than-sign-state.
!   ">"  -> emit it and return to script-data-state.
!   NUL  -> unexpected-null-character, emit a replacement character, back to
!           script-data-double-escaped-state.
!   f    -> eof-in-script-html-comment-like-text.
!   else -> emit the character, back to script-data-double-escaped-state.
! Fix: the default branch used to switch to script-data-escaped-state, which
! silently left double-escaped mode after any "--x" sequence; the sibling
! dash state and the NUL branch here both return to the double-escaped state.
! NOTE(review): the NUL branch runs `reach` with the original character still
! on the stack; see unexpected-null-character's stack effect -- confirm.
1257 : (script-data-double-escaped-dash-dash-state) ( document n/f string ch/f -- document n'/f string )
1259 { [ dup CHAR: - = ] [ reach emit-char script-data-double-escaped-dash-dash-state ] }
1260 { [ dup CHAR: < = ] [ reach emit-char script-data-double-escaped-less-than-sign-state ] }
1261 { [ dup CHAR: > = ] [ reach emit-char script-data-state ] }
1262 { [ dup CHAR: \0 = ] [
1263 unexpected-null-character
1264 CHAR: replacement-character reach emit-char
1265 script-data-double-escaped-state
1267 { [ dup f = ] [ eof-in-script-html-comment-like-text ] }
1268 [ reach emit-char script-data-double-escaped-state ]
! Entry point: take the next character and dispatch on it.
1271 : script-data-double-escaped-dash-dash-state ( document n/f string -- document n'/f string )
1272 take-char (script-data-double-escaped-dash-dash-state) ;
! Script-data double-escaped less-than-sign state.
!   "/"  -> emit it, reset the temporary buffer, continue in
!           script-data-double-escape-end-state.
!   else -> re-dispatch the character through
!           (script-data-double-escaped-state).
1275 : (script-data-double-escaped-less-than-sign-state) ( document n/f string ch/f -- document n'/f string )
1277 { [ dup CHAR: / = ] [ reach emit-char pick reset-temporary-buffer script-data-double-escape-end-state ] }
1278 [ (script-data-double-escaped-state) ]
! Entry point: take the next character and dispatch on it.
1281 : script-data-double-escaped-less-than-sign-state ( document n/f string -- document n'/f string )
1282 take-char (script-data-double-escaped-less-than-sign-state) ;
! Script-data double-escape-end state ("</" + letters seen in double-escaped
! script text; decides whether the nested "</script" ends).
!   whitespace, "/" or ">" -> compare the temporary buffer with "script":
!       equal -> script-data-escaped-state, else
!       script-data-double-escaped-state.
!   ASCII letter -> append the lowercased letter to the temporary buffer and
!       emit the letter as text, stay in this state.
!   else -> re-dispatch the character through
!       (script-data-double-escaped-state).
! Fixes in the letter branches:
!  * No tag token is being built here, so push-tag-name is removed.
!  * The lowercased letter (0x20 +) is buffered so that "SCRIPT" matches the
!    lowercase "script" comparison.
!  * The letter is text and is now emitted with emit-char.
! NOTE(review): whether the delimiter in the first branch is emitted before
! the buffer comparison cannot be seen from these lines -- confirm.
1285 : (script-data-double-escape-end-state) ( document n/f string ch/f -- document n'/f string )
1287 { [ dup "\t\n\f\s/>" member? ] [
1289 pick temporary-buffer>> "script" sequence=
1290 [ script-data-escaped-state ] [ script-data-double-escaped-state ] if
1292 { [ dup ascii-upper-alpha? ] [ [ 0x20 + reach push-temporary-buffer ] [ reach emit-char ] bi script-data-double-escape-end-state ] }
1293 { [ dup ascii-lower-alpha? ] [ [ reach push-temporary-buffer ] [ reach emit-char ] bi script-data-double-escape-end-state ] }
1294 [ (script-data-double-escaped-state) ]
! Entry point: take the next character and dispatch on it.
1297 : script-data-double-escape-end-state ( document n/f string -- document n'/f string )
1298 take-char (script-data-double-escape-end-state) ;
! Before-attribute-name state.
!   whitespace     -> skip it and stay in this state.
!   "/", ">" or f  -> re-dispatch through (after-attribute-name-state).
!   "="            -> unexpected-equals-sign-before-attribute-name.
!   else           -> push-attribute on the document (starts a new attribute,
!                     presumably -- see push-attribute) and re-dispatch the
!                     character through (attribute-name-state).
! NOTE(review): the "=" branch leaves the character on the stack and names no
! follow-up state; only balanced if the error word does not return -- confirm.
1301 : (before-attribute-name-state) ( document n/f string ch/f -- document n'/f string )
1303 { [ dup "\t\n\f\s" member? ] [ drop before-attribute-name-state ] }
1304 { [ dup "/>" member? ] [ (after-attribute-name-state) ] }
1305 { [ dup f = ] [ (after-attribute-name-state) ] }
1306 { [ dup CHAR: = = ] [ unexpected-equals-sign-before-attribute-name ] }
1307 [ reach push-attribute (attribute-name-state) ]
! Entry point: take the next character and dispatch on it.
1310 : before-attribute-name-state ( document n/f string -- document n'/f string )
1311 take-char (before-attribute-name-state) ;
! Attribute-name state.
!   whitespace, "/", ">" or f -> re-dispatch through
!                                (after-attribute-name-state).
!   "="                       -> continue in before-attribute-value-state.
!   ASCII upper-case letter   -> push its lowercase form (0x20 +) onto the
!                                attribute name, stay in this state.
!   NUL                       -> unexpected-null-character.
!   double quote, "'" or "<"  -> unexpected-character-in-attribute-name, then
!                                push the character and stay in this state.
!   else                      -> push the character onto the attribute name
!                                and stay in this state.
! NOTE(review): the NUL branch leaves the character on the stack and names no
! follow-up state; only balanced if the error word does not return -- confirm.
1314 : (attribute-name-state) ( document n/f string ch/f -- document n'/f string )
1316 { [ dup "\t\n\f\s/>" member? ] [ (after-attribute-name-state) ] }
1317 { [ dup f = ] [ (after-attribute-name-state) ] }
1318 { [ dup CHAR: = = ] [ drop before-attribute-value-state ] }
1319 { [ dup ascii-upper-alpha? ] [
1320 0x20 + reach push-attribute-name
1321 attribute-name-state
1323 { [ dup CHAR: \0 = ] [ unexpected-null-character ] }
1324 { [ dup "\"'<" member? ] [
1325 unexpected-character-in-attribute-name
1326 reach push-attribute-name attribute-name-state
1328 [ reach push-attribute-name attribute-name-state ]
! Entry point: take the next character and dispatch on it.
1331 : attribute-name-state ( document n/f string -- document n'/f string )
1332 take-char (attribute-name-state) ;
! After-attribute-name state.
!   whitespace -> skip it and stay in this state.
!   "/"        -> continue in self-closing-start-tag-state.
!   "="        -> continue in before-attribute-value-state.
!   ">"        -> emit the tag on the document, return to data-state.
!   f          -> eof-in-tag.
!   else       -> push-attribute on the document (character kept aside with
!                 dip) and re-dispatch the character through
!                 (attribute-name-state).
1335 : (after-attribute-name-state) ( document n/f string ch/f -- document n'/f string )
1337 { [ dup "\t\n\f\s" member? ] [ drop after-attribute-name-state ] }
1338 { [ dup CHAR: / = ] [ drop self-closing-start-tag-state ] }
1339 { [ dup CHAR: = = ] [ drop before-attribute-value-state ] }
1340 { [ dup CHAR: > = ] [ drop pick emit-tag data-state ] }
1341 { [ dup f = ] [ eof-in-tag ] }
1342 [ [ pick push-attribute ] dip (attribute-name-state) ]
! Entry point: take the next character and dispatch on it.
1345 : after-attribute-name-state ( document n/f string -- document n'/f string )
1346 take-char (after-attribute-name-state) ;
! Before-attribute-value state (an "=" was just consumed).
!   whitespace   -> skip it and stay in this state.
!   double quote -> continue in attribute-value-double-quoted-state.
!   "'"          -> continue in attribute-value-single-quoted-state.
!   ">"          -> missing-attribute-value, then emit the tag and return to
!                   data-state.
!   else         -> re-dispatch the character through
!                   (attribute-value-unquoted-state).
! Fixes:
!  * Whitespace used to jump to before-attribute-name-state, so in
!    `href= "x"` the quoted value was parsed as a new attribute name.  It now
!    stays in this state.
!  * The ">" branch reported the error but never emitted the tag or named a
!    follow-up state; it now mirrors the ">" handling of the neighbouring
!    attribute states (`pick emit-tag data-state`).
1349 : (before-attribute-value-state) ( document n/f string ch/f -- document n'/f string )
1351 { [ dup "\t\n\f\s" member? ] [ drop before-attribute-value-state ] }
1352 { [ dup CHAR: " = ] [ drop attribute-value-double-quoted-state ] }
1353 { [ dup CHAR: ' = ] [ drop attribute-value-single-quoted-state ] }
1354 { [ dup CHAR: > = ] [ drop missing-attribute-value pick emit-tag data-state ] }
1355 [ (attribute-value-unquoted-state) ]
! Entry point: take the next character and dispatch on it.
1358 : before-attribute-value-state ( document n/f string -- document n'/f string )
1359 take-char (before-attribute-value-state) ;
! Attribute-value (double-quoted) state.
!   double quote -> continue in after-attribute-value-quoted-state.
!   "&"          -> store attribute-value-double-quoted-state as the return
!                   state (2dip reaches under two stack items to the
!                   document) and continue in character-reference-state.
!   NUL          -> unexpected-null-character.
!   f            -> eof-in-tag.
!   else         -> push the character onto the attribute value and stay in
!                   this state.
! NOTE(review): the "&" branch as visible here does not drop the character
! before the 2dip, unlike the single-quoted variant -- confirm.
1362 : (attribute-value-double-quoted-state) ( document n/f string ch/f -- document n'/f string )
1364 { [ dup CHAR: " = ] [ drop after-attribute-value-quoted-state ] }
1365 { [ dup CHAR: & = ] [
1367 [ \ attribute-value-double-quoted-state >>return-state ] 2dip character-reference-state
1369 { [ dup CHAR: \0 = ] [ unexpected-null-character ] }
1370 { [ dup f = ] [ eof-in-tag ] }
1371 [ reach push-attribute-value attribute-value-double-quoted-state ]
! Entry point: take the next character and dispatch on it.
1374 : attribute-value-double-quoted-state ( document n/f string -- document n'/f string )
1375 take-char (attribute-value-double-quoted-state) ;
! Attribute-value (single-quoted) state.
!   "'"  -> continue in after-attribute-value-quoted-state.
!   "&"  -> drop it, store attribute-value-single-quoted-state as the return
!           state on the document (2dip reaches under n and string), then
!           continue in character-reference-state.
!   NUL  -> unexpected-null-character, then push a replacement character
!           onto the attribute value.
!   f    -> eof-in-tag.
!   else -> push the character onto the attribute value and stay in this
!           state.
1378 : (attribute-value-single-quoted-state) ( document n/f string ch/f -- document n'/f string )
1380 { [ dup CHAR: ' = ] [ drop after-attribute-value-quoted-state ] }
1381 { [ dup CHAR: & = ] [
1382 drop [ \ attribute-value-single-quoted-state >>return-state ] 2dip
1383 character-reference-state
1385 { [ dup CHAR: \0 = ] [
1386 drop unexpected-null-character
1387 CHAR: replacement-character reach push-attribute-value
1389 { [ dup f = ] [ eof-in-tag ] }
1390 [ reach push-attribute-value attribute-value-single-quoted-state ]
! Entry point: take the next character and dispatch on it.
1393 : attribute-value-single-quoted-state ( document n/f string -- document n'/f string )
1394 take-char (attribute-value-single-quoted-state) ;
! Attribute-value (unquoted) state.
!   whitespace -> the value is finished; continue in
!                 before-attribute-name-state.
!   "&"        -> store attribute-value-unquoted-state as the return state
!                 and continue in character-reference-state.
!   ">"        -> emit the tag on the document, return to data-state.
!   NUL        -> unexpected-null-character, then push a replacement
!                 character onto the attribute value.
!   double quote, "'", "<", "=" or "`" ->
!                 unexpected-character-in-unquoted-attribute-value, then push
!                 the character and stay in this state.
!   f          -> eof-in-tag.
!   else       -> push the character onto the attribute value and stay in
!                 this state.
! NOTE(review): the NUL branch names no follow-up state after pushing the
! replacement character -- confirm against unexpected-null-character.
1397 : (attribute-value-unquoted-state) ( document n/f string ch/f -- document n'/f string )
1399 { [ dup "\t\n\f\s" member? ] [ drop before-attribute-name-state ] }
1400 { [ dup CHAR: & = ] [
1402 [ \ attribute-value-unquoted-state >>return-state ] 2dip character-reference-state
1404 { [ dup CHAR: > = ] [ drop pick emit-tag data-state ] }
1405 { [ dup CHAR: \0 = ] [ drop unexpected-null-character CHAR: replacement-character reach push-attribute-value ] }
1406 { [ dup "\"'<=`" member? ] [
1407 unexpected-character-in-unquoted-attribute-value
1408 reach push-attribute-value
1409 attribute-value-unquoted-state
1411 { [ dup f = ] [ eof-in-tag ] }
1412 [ reach push-attribute-value attribute-value-unquoted-state ]
! Entry point: take the next character and dispatch on it.
1415 : attribute-value-unquoted-state ( document n/f string -- document n'/f string )
1416 take-char (attribute-value-unquoted-state) ;
! After-attribute-value (quoted) state.
!   whitespace -> continue in before-attribute-name-state.
!   "/"        -> continue in self-closing-start-tag-state.
!   ">"        -> emit the tag on the document, return to data-state.
!   f          -> eof-in-tag.
!   else       -> missing-whitespace-between-attributes, then re-dispatch the
!                 character through (before-attribute-name-state).
1419 : (after-attribute-value-quoted-state) ( document n/f string ch/f -- document n'/f string )
1421 { [ dup "\t\n\f\s" member? ] [ drop before-attribute-name-state ] }
1422 { [ dup CHAR: / = ] [ drop self-closing-start-tag-state ] }
1423 { [ dup CHAR: > = ] [ drop pick emit-tag data-state ] }
1424 { [ dup f = ] [ eof-in-tag ] }
1425 [ missing-whitespace-between-attributes (before-attribute-name-state) ]
! Entry point: take the next character and dispatch on it.
1428 : after-attribute-value-quoted-state ( document n/f string -- document n'/f string )
1429 take-char (after-attribute-value-quoted-state) ;
! Self-closing-start-tag state (a "/" was seen inside a tag).
!   ">"  -> apply set-self-closing and then emit-tag to the document (bi),
!           return to data-state.
!   f    -> eof-in-tag.
!   else -> unexpected-solidus-in-tag.
! NOTE(review): the default branch leaves the character on the stack and
! names no follow-up state; only balanced if the error word does not
! return -- confirm.
1432 : (self-closing-start-tag-state) ( document n/f string ch/f -- document n'/f string )
1434 { [ dup CHAR: > = ] [ drop pick [ set-self-closing ] [ emit-tag ] bi data-state ] }
1435 { [ dup f = ] [ eof-in-tag ] }
1436 [ unexpected-solidus-in-tag ]
! Entry point: take the next character and dispatch on it.
1439 : self-closing-start-tag-state ( document n/f string -- document n'/f string )
1440 take-char (self-closing-start-tag-state) ;
! Bogus-comment state.
!   ">"  -> emit the comment token, return to data-state.
!   f    -> emit the comment token, then emit-eof (both applied to the
!           document with bi).
!   NUL  -> unexpected-null-character, then push a replacement character
!           onto the comment token.
!   else -> push the character onto the comment token and stay in this state.
! NOTE(review): the NUL branch names no follow-up state after pushing the
! replacement character -- confirm against unexpected-null-character.
1443 : (bogus-comment-state) ( document n/f string ch/f -- document n'/f string )
1445 { [ dup CHAR: > = ] [ drop pick emit-comment-token data-state ] }
1446 { [ dup f = ] [ drop pick [ emit-comment-token ] [ emit-eof ] bi ] }
1447 { [ dup CHAR: \0 = ] [ drop unexpected-null-character CHAR: replacement-character reach push-comment-token ] }
1448 [ reach push-comment-token bogus-comment-state ]
! Entry point: take the next character and dispatch on it.
1451 : bogus-comment-state ( document n/f string -- document n'/f string )
1452 take-char (bogus-comment-state) ;
! Markup-declaration-open state ("<!" was seen).  Unlike the other states
! this one does not take a single character; it probes the input for whole
! keywords:
!   "--"       -> comment-start-state.
!   "DOCTYPE"  -> doctype-state (matched case-insensitively).
!   "[CDATA["  -> unimplemented*.
!   otherwise  -> incorrectly-opened-comment.
! Fix: "[CDATA[" is a case-sensitive keyword in the HTML tokenizer, so it is
! now probed with take-from? (as "--" is) instead of take-from-insensitive?.
! Lower-case "<![cdata[" therefore falls through to the
! incorrectly-opened-comment path instead of hitting unimplemented*.
1455 : markup-declaration-open-state ( document n/f string -- document n'/f string )
1457 { [ "--" take-from? ] [ comment-start-state ] }
1458 { [ "DOCTYPE" take-from-insensitive? ] [ doctype-state ] }
1459 { [ "[CDATA[" take-from? ] [ unimplemented* ] }
1461 incorrectly-opened-comment ! bogus-comment-state
! Comment-start state ("<!--" was seen).
!   "-"  -> continue in comment-start-dash-state.
!   ">"  -> abrupt-closing-of-empty-comment, then emit the comment token and
!           return to data-state.
! NOTE(review): no fallback branch is visible in these lines; other
! characters presumably continue into the comment body -- confirm.
1465 : (comment-start-state) ( document n/f string ch/f -- document n'/f string )
1467 { [ dup CHAR: - = ] [ drop comment-start-dash-state ] }
1468 { [ dup CHAR: > = ] [ drop abrupt-closing-of-empty-comment pick emit-comment-token data-state ] }
! Entry point: take the next character and dispatch on it.
1472 : comment-start-state ( document n/f string -- document n'/f string )
1473 take-char (comment-start-state) ;
! Comment-start-dash state ("<!---" was seen).
!   "-"  -> continue in comment-end-state.
!   ">"  -> abrupt-closing-of-empty-comment, then emit the comment token and
!           return to data-state.
!   f    -> eof-in-comment.
!   else -> push the pending "-" onto the comment token and re-dispatch the
!           character through (comment-state).
! Fix: the ">" branch used to stop after reporting the error, without
! emitting the comment or naming a follow-up state.  It now does the same
! `pick emit-comment-token data-state` as the ">" branch of the
! comment-start state.
1476 : (comment-start-dash-state) ( document n/f string ch/f -- document n'/f string )
1478 { [ dup CHAR: - = ] [ drop comment-end-state ] }
1479 { [ dup CHAR: > = ] [ drop abrupt-closing-of-empty-comment pick emit-comment-token data-state ] }
1480 { [ dup f = ] [ eof-in-comment ] }
1481 [ [ CHAR: - reach push-comment-token ] dip (comment-state) ]
! Entry point: take the next character and dispatch on it.
1484 : comment-start-dash-state ( document n/f string -- document n'/f string )
1485 take-char (comment-start-dash-state) ;
! Comment state (inside the comment body).
!   "<"  -> push it onto the comment token, continue in
!           comment-less-than-sign-state.
!   "-"  -> continue in comment-end-dash-state (the hyphen is pushed later if
!           it turns out not to end the comment).
!   NUL  -> unexpected-null-character.
!   f    -> eof-in-comment.
!   else -> push the character onto the comment token and stay in this state.
! NOTE(review): the NUL branch names no follow-up state and pushes no
! replacement character -- confirm against unexpected-null-character.
1488 : (comment-state) ( document n/f string ch/f -- document n'/f string )
1490 { [ dup CHAR: < = ] [ reach push-comment-token comment-less-than-sign-state ] }
1491 { [ dup CHAR: - = ] [ drop comment-end-dash-state ] }
1492 { [ dup CHAR: \0 = ] [ drop unexpected-null-character ] }
1493 { [ dup f = ] [ eof-in-comment ] }
1494 [ reach push-comment-token comment-state ]
! Entry point: take the next character and dispatch on it.
1497 : comment-state ( document n/f string -- document n'/f string )
1498 take-char (comment-state) ;
! Comment less-than-sign state ("<" seen inside a comment).
!   "!"  -> push it onto the comment token, continue in
!           comment-less-than-sign-bang-state.
!   "<"  -> push it onto the comment token and stay in this state.
! NOTE(review): no fallback branch is visible in these lines; other
! characters presumably return to the comment state -- confirm.
1501 : (comment-less-than-sign-state) ( document n/f string ch/f -- document n'/f string )
1503 { [ dup CHAR: ! = ] [ reach push-comment-token comment-less-than-sign-bang-state ] }
1504 { [ dup CHAR: < = ] [ reach push-comment-token comment-less-than-sign-state ] }
! Entry point: take the next character and dispatch on it.
1508 : comment-less-than-sign-state ( document n/f string -- document n'/f string )
1509 take-char (comment-less-than-sign-state) ;
! Comment less-than-sign-bang state ("<!" seen inside a comment).
!   "-"  -> continue in comment-less-than-sign-bang-dash-state.
! Fix: the hyphen used to be pushed onto the comment token here.  The
! following states already account for it: the bang-dash state falls back to
! (comment-end-dash-state), whose default branch pushes the pending "-", and
! the comment-end state pushes "--".  Pushing it here as well duplicated the
! hyphen in the comment data ("<!-x" became "<!--x"), so it is now dropped.
! NOTE(review): no fallback branch is visible in these lines; other
! characters presumably return to the comment state -- confirm.
1512 : (comment-less-than-sign-bang-state) ( document n/f string ch/f -- document n'/f string )
1514 { [ dup CHAR: - = ] [ drop comment-less-than-sign-bang-dash-state ] }
! Entry point: take the next character and dispatch on it.
1518 : comment-less-than-sign-bang-state ( document n/f string -- document n'/f string )
1519 take-char (comment-less-than-sign-bang-state) ;
! Comment less-than-sign-bang-dash state ("<!-" seen inside a comment).
!   "-"  -> continue in comment-less-than-sign-bang-dash-dash-state.
!   else -> re-dispatch the character through (comment-end-dash-state).
1522 : (comment-less-than-sign-bang-dash-state) ( document n/f string ch/f -- document n'/f string )
1524 { [ dup CHAR: - = ] [ drop comment-less-than-sign-bang-dash-dash-state ] }
1525 [ (comment-end-dash-state) ]
! Entry point: take the next character and dispatch on it.
1528 : comment-less-than-sign-bang-dash-state ( document n/f string -- document n'/f string )
1529 take-char (comment-less-than-sign-bang-dash-state) ;
! Comment less-than-sign-bang-dash-dash state ("<!--" seen inside a comment).
!   ">" or f -> re-dispatch through (comment-end-state).
!   else     -> nested-comment, then re-dispatch the character through
!               (comment-end-state).
1532 : (comment-less-than-sign-bang-dash-dash-state) ( document n/f string ch/f -- document n'/f string )
1534 { [ dup CHAR: > = ] [ (comment-end-state) ] }
1535 { [ dup f = ] [ (comment-end-state) ] }
1536 [ nested-comment (comment-end-state) ]
! Entry point: take the next character and dispatch on it.
1539 : comment-less-than-sign-bang-dash-dash-state ( document n/f string -- document n'/f string )
1540 take-char (comment-less-than-sign-bang-dash-dash-state) ;
! Comment-end-dash state (one "-" seen inside a comment).
!   "-"  -> continue in comment-end-state.
!   f    -> eof-in-comment.
!   else -> the hyphen was ordinary text: push "-" onto the comment token and
!           re-dispatch the character through (comment-state).
1543 : (comment-end-dash-state) ( document n/f string ch/f -- document n'/f string )
1545 { [ dup CHAR: - = ] [ drop comment-end-state ] }
1546 { [ dup f = ] [ eof-in-comment ] }
1547 [ [ CHAR: - reach push-comment-token ] dip (comment-state) ]
! Entry point: take the next character and dispatch on it.
1550 : comment-end-dash-state ( document n/f string -- document n'/f string )
1551 take-char (comment-end-dash-state) ;
! Comment-end state ("--" seen inside a comment).
!   ">"  -> emit the comment token, return to data-state.
!   "!"  -> continue in comment-end-bang-state.
!   "-"  -> push it onto the comment token and stay in this state.
!   f    -> eof-in-comment, then emit the comment token and emit-eof.
!   else -> the "--" was ordinary text: push "--" onto the comment token and
!           re-dispatch the character through (comment-state).
1554 : (comment-end-state) ( document n/f string ch/f -- document n'/f string )
1556 { [ dup CHAR: > = ] [ drop pick emit-comment-token data-state ] }
1557 { [ dup CHAR: ! = ] [ drop comment-end-bang-state ] }
1558 { [ dup CHAR: - = ] [ reach push-comment-token comment-end-state ] }
1559 { [ dup f = ] [ drop eof-in-comment pick [ emit-comment-token ] [ emit-eof ] bi ] }
1560 [ [ "--" reach push-all-comment-token ] dip (comment-state) ]
! Entry point: take the next character and dispatch on it.
1563 : comment-end-state ( document n/f string -- document n'/f string )
1564 take-char (comment-end-state) ;
! Comment-end-bang state ("--!" seen inside a comment).
!   "-"  -> the "--!" was ordinary text: push "--!" onto the comment token,
!           then continue in comment-end-dash-state.
!   ">"  -> incorrectly-closed-comment, then emit the comment token and
!           return to data-state.
!   f    -> eof-in-comment.
!   else -> push "--!" onto the comment token and re-dispatch the character
!           through (comment-state).
! Fix: the "-" branch used to switch state without pushing "--!", so a
! comment such as "a--!-b" lost those three characters.  It now pushes them
! exactly like the default branch does.
1567 : (comment-end-bang-state) ( document n/f string ch/f -- document n'/f string )
1569 { [ dup CHAR: - = ] [ drop "--!" reach push-all-comment-token comment-end-dash-state ] }
1570 { [ dup CHAR: > = ] [ drop incorrectly-closed-comment pick emit-comment-token data-state ] }
1571 { [ dup f = ] [ eof-in-comment ] }
1572 [ [ "--!" reach push-all-comment-token ] dip (comment-state) ]
! Entry point: take the next character and dispatch on it.
1575 : comment-end-bang-state ( document n/f string -- document n'/f string )
1576 take-char (comment-end-bang-state) ;
! DOCTYPE state ("<!DOCTYPE" was matched).
!   whitespace -> continue in before-doctype-name-state.
!   ">"        -> re-dispatch through (before-doctype-name-state).
!   f          -> eof-in-doctype, then on the document: new-doctype-with-quirks,
!                 emit-doctype, emit-eof (tri).
!   else       -> missing-whitespace-before-doctype-name.
! NOTE(review): the default branch leaves the character on the stack and
! names no follow-up state; only balanced if the error word does not
! return -- confirm.
1579 : (doctype-state) ( document n/f string ch/f -- document n'/f string )
1581 { [ dup "\t\n\f\s" member? ] [ drop before-doctype-name-state ] }
1582 { [ dup CHAR: > = ] [ (before-doctype-name-state) ] }
1583 { [ dup f = ] [ drop eof-in-doctype pick [ new-doctype-with-quirks ] [ emit-doctype ] [ emit-eof ] tri ] }
1584 [ missing-whitespace-before-doctype-name ]
! Entry point: take the next character and dispatch on it.
1587 : doctype-state ( document n/f string -- document n'/f string )
1588 take-char (doctype-state) ;
! Before-DOCTYPE-name state.
!   whitespace            -> skip it and stay in this state.
!   ASCII upper-case char -> start a doctype from its lowercase form
!                            (0x20 +), continue in doctype-name-state.
!   NUL                   -> unexpected-null-character, then start a doctype
!                            from a replacement character.
!   ">"                   -> missing-doctype-name, then new-doctype-with-quirks
!                            and emit-doctype on the document.
!   (a further branch applies new-doctype-with-quirks, emit-doctype and
!    emit-eof to the document; its condition is not visible in these lines,
!    presumably end of input -- confirm.)
!   else                  -> start a doctype from the character, continue in
!                            doctype-name-state.
1591 : (before-doctype-name-state) ( document n/f string ch/f -- document n'/f string )
1593 { [ dup "\t\n\f\s" member? ] [ drop before-doctype-name-state ] }
1594 { [ dup ascii-upper-alpha? ] [ 0x20 + reach new-doctype-from-ch doctype-name-state ] }
1595 { [ dup CHAR: \0 = ] [
1597 unexpected-null-character
1598 CHAR: replacement-character reach new-doctype-from-ch
1601 { [ dup CHAR: > = ] [
1602 drop missing-doctype-name
1603 pick [ new-doctype-with-quirks ] [ emit-doctype ] bi
1607 pick [ new-doctype-with-quirks ] [ emit-doctype ] [ emit-eof ] tri
1609 [ reach new-doctype-from-ch doctype-name-state ]
! Entry point: take the next character and dispatch on it.
1612 : before-doctype-name-state ( document n/f string -- document n'/f string )
1613 take-char (before-doctype-name-state) ;
! DOCTYPE-name state.
!   whitespace            -> continue in after-doctype-name-state.
!   ">"                   -> emit the doctype, return to data-state.
!   ASCII upper-case char -> push its lowercase form (0x20 +) onto the
!                            doctype name, stay in this state.
!   NUL                   -> unexpected-null-character, then push a
!                            replacement character onto the doctype name.
!   f                     -> eof-in-doctype, then force-quirks, emit-doctype
!                            and emit-eof on the document.
!   else                  -> push the character onto the doctype name and
!                            stay in this state.
! Fixes:
!  * NUL branch: after `drop` the stack is document n string, so with the
!    replacement character pushed the document is the fourth item; `pick`
!    selected n instead.  Uses `reach`, as the other NUL branches do.
!  * f branch: the doctype was emitted without force-quirks (the original
!    carried a reminder comment about it); now uses the same
!    force-quirks / emit-doctype / emit-eof `tri` as the later doctype states.
1616 : (doctype-name-state) ( document n/f string ch/f -- document n'/f string )
1618 { [ dup "\t\n\f\s" member? ] [ drop after-doctype-name-state ] }
1619 { [ dup CHAR: > = ] [ drop pick emit-doctype data-state ] }
1620 { [ dup ascii-upper-alpha? ] [ 0x20 + reach push-doctype-name doctype-name-state ] }
1621 { [ dup CHAR: \0 = ] [
1622 drop unexpected-null-character
1623 CHAR: replacement-character reach push-doctype-name
1626 { [ dup f = ] [ drop eof-in-doctype pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri ] }
1627 [ reach push-doctype-name doctype-name-state ]
! Entry point: take the next character and dispatch on it.
1630 : doctype-name-state ( document n/f string -- document n'/f string )
1631 take-char (doctype-name-state) ;
! After-DOCTYPE-name state.
!   whitespace -> skip it and stay in this state.
!   ">"        -> emit the doctype, return to data-state.
!   f          -> eof-in-doctype.
!   "PUBLIC"   -> (case-insensitive probe with the character kept aside by
!                 dip) continue in after-doctype-public-keyword-state.
!   "SYSTEM"   -> likewise, continue in after-doctype-system-keyword-state.
!   else       -> invalid-character-sequence-after-doctype-name.
! NOTE(review): the keyword probes run after take-char has already produced
! the current character; whether take-from-insensitive? matches starting at
! that character or after it is not visible here -- confirm.
1634 : (after-doctype-name-state) ( document n/f string ch/f -- document n'/f string )
1636 { [ dup "\t\n\f\s" member? ] [ drop after-doctype-name-state ] }
1637 { [ dup CHAR: > = ] [ drop pick emit-doctype data-state ] }
1638 { [ dup f = ] [ eof-in-doctype ] }
1639 { [ [ "PUBLIC" take-from-insensitive? ] dip swap ] [ drop after-doctype-public-keyword-state ] }
1640 { [ [ "SYSTEM" take-from-insensitive? ] dip swap ] [ drop after-doctype-system-keyword-state ] }
1641 [ invalid-character-sequence-after-doctype-name ]
! Entry point: take the next character and dispatch on it.
1644 : after-doctype-name-state ( document n/f string -- document n'/f string )
1645 take-char (after-doctype-name-state) ;
! After-DOCTYPE-public-keyword state ("PUBLIC" was matched).
!   whitespace          -> continue in
!                          before-doctype-public-identifier-state.
!   double quote or "'" -> missing-whitespace-after-doctype-public-keyword.
!   ">"                 -> missing-doctype-public-identifier, then
!                          force-quirks and emit-doctype on the document,
!                          return to data-state.
!   f                   -> eof-in-doctype, then force-quirks, emit-doctype
!                          and emit-eof on the document.
!   else                -> missing-quote-before-doctype-public-identifier,
!                          force-quirks on the document, re-dispatch the
!                          character through (bogus-doctype-state).
! Fixes:
!  * ">" branch: force-quirks was called with no document selected (the top
!    of the stack there is the string) and the doctype was never emitted.
!    Now `pick [ force-quirks ] [ emit-doctype ] bi`, as in the
!    before-public-identifier state.
!  * f branch: force-quirks added (tri), matching the later doctype states.
!  * default branch: under `dip` only document n string are in reach, so
!    `reach` addressed an item below the document; `pick` selects it.
! NOTE(review): the quote branches leave the character on the stack and name
! no follow-up state; only balanced if the error word does not return.
1648 : (after-doctype-public-keyword-state) ( document n/f string ch/f -- document n'/f string )
1650 { [ dup "\t\n\f\s" member? ] [ drop before-doctype-public-identifier-state ] }
1651 { [ dup CHAR: " = ] [ missing-whitespace-after-doctype-public-keyword ] }
1652 { [ dup CHAR: ' = ] [ missing-whitespace-after-doctype-public-keyword ] }
1653 { [ dup CHAR: > = ] [ drop missing-doctype-public-identifier pick [ force-quirks ] [ emit-doctype ] bi data-state ] }
1654 { [ dup f = ] [ drop eof-in-doctype pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri ] }
1656 missing-quote-before-doctype-public-identifier
1657 [ pick force-quirks ] dip
1658 (bogus-doctype-state)
! Entry point: take the next character and dispatch on it.
1662 : after-doctype-public-keyword-state ( document n/f string -- document n'/f string )
1663 take-char (after-doctype-public-keyword-state) ;
! Before-DOCTYPE-public-identifier state.
!   whitespace   -> skip it and stay in this state.
!   double quote -> initialize the public identifier, continue in
!                   doctype-public-identifier-double-quoted-state.
!   "'"          -> initialize the public identifier, continue in
!                   doctype-public-identifier-single-quoted-state.
!   ">"          -> missing-doctype-public-identifier, then force-quirks and
!                   emit-doctype on the document.
!   f            -> eof-in-doctype, then force-quirks, emit-doctype and
!                   emit-eof on the document.
!   else         -> missing-quote-before-doctype-public-identifier,
!                   force-quirks on the document, re-dispatch the character
!                   through (bogus-doctype-state).
! Fixes:
!  * f branch: the doctype was emitted without force-quirks; now uses the
!    same force-quirks / emit-doctype / emit-eof `tri` as the later states.
!  * default branch: under `dip` only document n string are in reach, so
!    `reach` addressed an item below the document; `pick` selects it.
1666 : (before-doctype-public-identifier-state) ( document n/f string ch/f -- document n'/f string )
1668 { [ dup "\t\n\f\s" member? ] [ drop before-doctype-public-identifier-state ] }
1669 { [ dup CHAR: " = ] [
1670 drop pick initialize-doctype-public-identifier
1671 doctype-public-identifier-double-quoted-state
1673 { [ dup CHAR: ' = ] [
1674 drop pick initialize-doctype-public-identifier
1675 doctype-public-identifier-single-quoted-state
1677 { [ dup CHAR: > = ] [
1678 drop missing-doctype-public-identifier
1679 pick [ force-quirks ] [ emit-doctype ] bi
1682 { [ dup f = ] [ drop eof-in-doctype pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri ] }
1684 missing-quote-before-doctype-public-identifier
1685 [ pick force-quirks ] dip
1686 (bogus-doctype-state)
! Entry point: take the next character and dispatch on it.
1690 : before-doctype-public-identifier-state ( document n/f string -- document n'/f string )
1691 take-char (before-doctype-public-identifier-state) ;
! DOCTYPE-public-identifier (double-quoted) state.
!   double quote -> continue in after-doctype-public-identifier-state.
!   NUL          -> unexpected-null-character, push a replacement character
!                   onto the public identifier, stay in this state.
!   ">"          -> abrupt-doctype-public-identifier, then force-quirks and
!                   emit-doctype on the document.
!   (a further branch applies force-quirks, emit-doctype and emit-eof to the
!    document; its condition is not visible in these lines, presumably end
!    of input -- confirm.)
!   else         -> push the character onto the public identifier and stay
!                   in this state.
! Fix: in the NUL branch the replacement character is pushed on top of
! document n string, which makes the document the fourth item; `pick`
! selected n.  Uses `reach`, as the default branch and the other NUL
! branches in this file do.
! NOTE(review): this assumes the NUL character itself has been dropped before
! unexpected-null-character runs -- confirm.
1694 : (doctype-public-identifier-double-quoted-state) ( document n/f string ch/f -- document n'/f string )
1696 { [ dup CHAR: " = ] [ drop after-doctype-public-identifier-state ] }
1697 { [ dup CHAR: \0 = ] [
1699 unexpected-null-character
1700 CHAR: replacement-character reach push-doctype-public-identifier
1701 doctype-public-identifier-double-quoted-state
1703 { [ dup CHAR: > = ] [
1704 drop abrupt-doctype-public-identifier
1705 pick [ force-quirks ] [ emit-doctype ] bi
1710 pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri
1712 [ reach push-doctype-public-identifier doctype-public-identifier-double-quoted-state ]
! Entry point: take the next character and dispatch on it.
1715 : doctype-public-identifier-double-quoted-state ( document n/f string -- document n'/f string )
1716 take-char (doctype-public-identifier-double-quoted-state) ;
! DOCTYPE-public-identifier (single-quoted) state.
!   "'"  -> continue in after-doctype-public-identifier-state.
!   NUL  -> unexpected-null-character, push a replacement character onto the
!           public identifier, stay in this state.
!   ">"  -> abrupt-doctype-public-identifier, then force-quirks and
!           emit-doctype on the document.
!   (a further branch applies force-quirks, emit-doctype and emit-eof to the
!    document; its condition is not visible in these lines, presumably end
!    of input -- confirm.)
!   else -> push the character onto the public identifier and stay in this
!           state.
! Fixes in the NUL branch:
!  * It used to continue in the double-quoted state (copy/paste slip), so a
!    later "'" no longer terminated the identifier.  It now stays in the
!    single-quoted state.
!  * `pick` selected n rather than the document once the replacement
!    character is on the stack; uses `reach` like the default branch.
! NOTE(review): the `reach` fix assumes the NUL character itself has been
! dropped before unexpected-null-character runs -- confirm.
1719 : (doctype-public-identifier-single-quoted-state) ( document n/f string ch/f -- document n'/f string )
1721 { [ dup CHAR: ' = ] [ drop after-doctype-public-identifier-state ] }
1722 { [ dup CHAR: \0 = ] [
1724 unexpected-null-character
1725 CHAR: replacement-character reach push-doctype-public-identifier
1726 doctype-public-identifier-single-quoted-state
1728 { [ dup CHAR: > = ] [
1729 drop abrupt-doctype-public-identifier
1730 pick [ force-quirks ] [ emit-doctype ] bi
1735 pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri
1737 [ reach push-doctype-public-identifier doctype-public-identifier-single-quoted-state ]
! Entry point: take the next character and dispatch on it.
1740 : doctype-public-identifier-single-quoted-state ( document n/f string -- document n'/f string )
1741 take-char (doctype-public-identifier-single-quoted-state) ;
! After-DOCTYPE-public-identifier state.
!   whitespace   -> continue in
!                   between-doctype-public-and-system-identifiers-state.
!   ">"          -> emit the doctype.
!   double quote -> missing-whitespace-between-doctype-public-and-system-
!                   identifiers, initialize the system identifier, continue
!                   in doctype-system-identifier-double-quoted-state.
!   "'"          -> same, continuing in the single-quoted state.
!   f            -> eof-in-doctype, then force-quirks, emit-doctype and
!                   emit-eof on the document.
!   else         -> missing-quote-before-doctype-system-identifier,
!                   force-quirks on the document, re-dispatch the character
!                   through (bogus-doctype-state).
! Fix: in the default branch `dip` hides the character, leaving only
! document n string in reach, so `reach` addressed an item below the
! document; `pick` selects the document.
1744 : (after-doctype-public-identifier-state) ( document n/f string ch/f -- document n'/f string )
1746 { [ dup "\t\n\f\s" member? ] [ drop between-doctype-public-and-system-identifiers-state ] }
1747 { [ dup CHAR: > = ] [
1748 drop pick emit-doctype
1751 { [ dup CHAR: " = ] [
1752 drop missing-whitespace-between-doctype-public-and-system-identifiers
1753 pick initialize-doctype-system-identifier
1754 doctype-system-identifier-double-quoted-state
1756 { [ dup CHAR: ' = ] [
1757 drop missing-whitespace-between-doctype-public-and-system-identifiers
1758 pick initialize-doctype-system-identifier
1759 doctype-system-identifier-single-quoted-state
1761 { [ dup f = ] [ drop eof-in-doctype pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri ] }
1763 missing-quote-before-doctype-system-identifier
1764 [ pick force-quirks ] dip
1765 (bogus-doctype-state)
! Entry point: take the next character and dispatch on it.
1769 : after-doctype-public-identifier-state ( document n/f string -- document n'/f string )
1770 take-char (after-doctype-public-identifier-state) ;
! Between-DOCTYPE-public-and-system-identifiers state.
!   whitespace   -> skip it and stay in this state.
!   ">"          -> emit the doctype.
!   double quote -> initialize the system identifier, continue in
!                   doctype-system-identifier-double-quoted-state.
!   "'"          -> initialize the system identifier, continue in
!                   doctype-system-identifier-single-quoted-state.
!   f            -> eof-in-doctype, then force-quirks, emit-doctype and
!                   emit-eof on the document.
!   else         -> missing-quote-before-doctype-system-identifier,
!                   force-quirks on the document, re-dispatch the character
!                   through (bogus-doctype-state).
! Fix: in the default branch `dip` hides the character, leaving only
! document n string in reach, so `reach` addressed an item below the
! document; `pick` selects the document.
1773 : (between-doctype-public-and-system-identifiers-state) ( document n/f string ch/f -- document n'/f string )
1775 { [ dup "\t\n\f\s" member? ] [ drop between-doctype-public-and-system-identifiers-state ] }
1776 { [ dup CHAR: > = ] [
1777 drop pick emit-doctype
1780 { [ dup CHAR: " = ] [
1781 drop pick initialize-doctype-system-identifier
1782 doctype-system-identifier-double-quoted-state
1784 { [ dup CHAR: ' = ] [
1785 drop pick initialize-doctype-system-identifier
1786 doctype-system-identifier-single-quoted-state
1788 { [ dup f = ] [ drop eof-in-doctype pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri ] }
1790 missing-quote-before-doctype-system-identifier
1791 [ pick force-quirks ] dip
1792 (bogus-doctype-state)
! Entry point: take the next character and dispatch on it.
1796 : between-doctype-public-and-system-identifiers-state ( document n/f string -- document n'/f string )
1797 take-char (between-doctype-public-and-system-identifiers-state) ;
! After-DOCTYPE-system-keyword state ("SYSTEM" was matched).
!   whitespace   -> continue in before-doctype-system-identifier-state.
!   double quote -> missing-whitespace-after-doctype-system-keyword,
!                   initialize the system identifier, continue in
!                   doctype-system-identifier-double-quoted-state.
!   "'"          -> same, continuing in the single-quoted state.
!   ">"          -> missing-doctype-system-identifier, then force-quirks and
!                   emit-doctype on the document.
!   f            -> eof-in-doctype, then force-quirks, emit-doctype and
!                   emit-eof on the document.
!   else         -> missing-quote-before-doctype-system-identifier,
!                   force-quirks on the document, re-dispatch the character
!                   through (bogus-doctype-state).
! Fixes:
!  * Whitespace used to continue in the between-public-and-system state,
!    whose ">" branch emits the doctype with no error and no quirks flag, so
!    "SYSTEM >" was accepted silently.  It now goes to
!    before-doctype-system-identifier-state, which reports the missing
!    identifier.
!  * default branch: under `dip` only document n string are in reach, so
!    `reach` addressed an item below the document; `pick` selects it.
1800 : (after-doctype-system-keyword-state) ( document n/f string ch/f -- document n'/f string )
1802 { [ dup "\t\n\f\s" member? ] [ drop before-doctype-system-identifier-state ] }
1803 { [ dup CHAR: " = ] [
1804 drop missing-whitespace-after-doctype-system-keyword
1805 pick initialize-doctype-system-identifier
1806 doctype-system-identifier-double-quoted-state
1808 { [ dup CHAR: ' = ] [
1809 drop missing-whitespace-after-doctype-system-keyword
1810 pick initialize-doctype-system-identifier
1811 doctype-system-identifier-single-quoted-state
1813 { [ dup CHAR: > = ] [
1814 drop missing-doctype-system-identifier
1815 pick [ force-quirks ] [ emit-doctype ] bi
1818 { [ dup f = ] [ drop eof-in-doctype pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri ] }
1820 missing-quote-before-doctype-system-identifier
1821 [ pick force-quirks ] dip
1822 (bogus-doctype-state)
! Entry point: take the next character and dispatch on it.
1826 : after-doctype-system-keyword-state ( document n/f string -- document n'/f string )
1827 take-char (after-doctype-system-keyword-state) ;
! Dispatcher for the "before DOCTYPE system identifier" tokenizer state.
! Stack on entry: document, index n (or f), input string, and the current
! character ch (f at end of input). Every branch disposes of ch.
1830 : (before-doctype-system-identifier-state) ( document n/f string ch/f -- document n'/f string )
! Whitespace is skipped by re-entering this state.
1832 { [ dup "\t\n\f\s" member? ] [ drop before-doctype-system-identifier-state ] }
! An opening quote initializes the system identifier on the document and
! continues in the matching quoted-identifier state.
1833 { [ dup CHAR: " = ] [
1834 drop pick initialize-doctype-system-identifier
1835 doctype-system-identifier-double-quoted-state
1837 { [ dup CHAR: ' = ] [
1838 drop pick initialize-doctype-system-identifier
1839 doctype-system-identifier-single-quoted-state
! '>' with no identifier: report it, then force-quirks and emit-doctype are
! both applied to the document.
1841 { [ dup CHAR: > = ] [
1842 drop missing-doctype-system-identifier
1843 pick [ force-quirks ] [ emit-doctype ] bi
! End of input: the HTML spec also sets force-quirks before emitting the
! DOCTYPE and EOF tokens, matching the other DOCTYPE states in this file.
1846 { [ dup f = ] [ drop eof-in-doctype pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri ] }
! Anything else: report the missing quote, call force-quirks on the document
! and reconsume ch in the bogus DOCTYPE state. Inside dip ch is off the stack,
! so the document is the third item (pick); reach would look one slot too deep.
1848 missing-quote-before-doctype-system-identifier
1849 [ pick force-quirks ] dip
1850 (bogus-doctype-state)
! Entry word for the "before DOCTYPE system identifier" state: runs take-char
! and passes the result to (before-doctype-system-identifier-state).
! NOTE(review): take-char is defined outside this view; presumably it yields
! the next character, or f at end of input -- confirm.
1854 : before-doctype-system-identifier-state ( document n/f string -- document n'/f string )
1855 take-char (before-doctype-system-identifier-state) ;
! Dispatcher for the "DOCTYPE system identifier (double-quoted)" state.
! Stack on entry: document, index n (or f), input string, and the current
! character ch (f at end of input).
1858 : (doctype-system-identifier-double-quoted-state) ( document n/f string ch/f -- document n'/f string )
! The closing double quote ends the identifier.
1860 { [ dup CHAR: " = ] [ drop after-doctype-system-identifier-state ] }
! NUL: report it and append the replacement character instead.
! push-doctype-system-identifier takes ( ch document ), as in the default
! branch below; with the replacement character on top the document is the
! fourth item, so reach is needed (pick would hand over the index).
! NOTE(review): assumes ch has already been dropped at this point -- confirm.
1861 { [ dup CHAR: \0 = ] [
1863 unexpected-null-character
1864 CHAR: replacement-character reach push-doctype-system-identifier
1865 doctype-system-identifier-double-quoted-state
! '>' inside the identifier: report the abrupt end, then force-quirks and
! emit-doctype are both applied to the document.
1867 { [ dup CHAR: > = ] [
1868 drop abrupt-doctype-system-identifier
1869 pick [ force-quirks ] [ emit-doctype ] bi
! force-quirks, emit-doctype and emit-eof applied to the document in order.
1874 pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri
! Any other character is appended to the system identifier and the state loops.
1876 [ reach push-doctype-system-identifier doctype-system-identifier-double-quoted-state ]
! Entry word for the "DOCTYPE system identifier (double-quoted)" state: runs
! take-char and passes the result to the (...) dispatcher of the same name.
! NOTE(review): take-char is defined outside this view; presumably it yields
! the next character, or f at end of input -- confirm.
1879 : doctype-system-identifier-double-quoted-state ( document n/f string -- document n'/f string )
1880 take-char (doctype-system-identifier-double-quoted-state) ;
! Dispatcher for the "DOCTYPE system identifier (single-quoted)" state.
! Stack on entry: document, index n (or f), input string, and the current
! character ch (f at end of input).
1883 : (doctype-system-identifier-single-quoted-state) ( document n/f string ch/f -- document n'/f string )
! The closing single quote ends the identifier.
1885 { [ dup CHAR: ' = ] [ drop after-doctype-system-identifier-state ] }
! NUL: report it, append the replacement character and stay in THIS
! (single-quoted) state; continuing in the double-quoted state would make a
! later ' part of the identifier and let a " terminate it.
! push-doctype-system-identifier takes ( ch document ), as in the default
! branch below; with the replacement character on top the document is the
! fourth item, so reach is needed (pick would hand over the index).
! NOTE(review): assumes ch has already been dropped at this point -- confirm.
1886 { [ dup CHAR: \0 = ] [
1888 unexpected-null-character
1889 CHAR: replacement-character reach push-doctype-system-identifier
1890 doctype-system-identifier-single-quoted-state
! '>' inside the identifier: report the abrupt end, then force-quirks and
! emit-doctype are both applied to the document.
1892 { [ dup CHAR: > = ] [
1893 drop abrupt-doctype-system-identifier
1894 pick [ force-quirks ] [ emit-doctype ] bi
! force-quirks, emit-doctype and emit-eof applied to the document in order.
1899 pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri
! Any other character is appended to the system identifier and the state loops.
1901 [ reach push-doctype-system-identifier doctype-system-identifier-single-quoted-state ]
! Entry word for the "DOCTYPE system identifier (single-quoted)" state: runs
! take-char and passes the result to the (...) dispatcher of the same name.
! NOTE(review): take-char is defined outside this view; presumably it yields
! the next character, or f at end of input -- confirm.
1904 : doctype-system-identifier-single-quoted-state ( document n/f string -- document n'/f string )
1905 take-char (doctype-system-identifier-single-quoted-state) ;
! Dispatcher for the "after DOCTYPE system identifier" tokenizer state.
! Stack on entry: document, index n (or f), input string, and the current
! character ch (f at end of input).
1908 : (after-doctype-system-identifier-state) ( document n/f string ch/f -- document n'/f string )
! Whitespace is skipped by re-entering this state.
1910 { [ dup "\t\n\f\s" member? ] [ drop after-doctype-system-identifier-state ] }
! '>' closes the DOCTYPE: emit-doctype is applied to the document.
1911 { [ dup CHAR: > = ] [
1912 drop pick emit-doctype
! End of input: report it, then force-quirks, emit-doctype and emit-eof are
! applied to the document in that order.
1915 { [ dup f = ] [ drop eof-in-doctype pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri ] }
! Anything else: report the stray character and reconsume ch in the bogus
! DOCTYPE state. The HTML spec explicitly does NOT set force-quirks in this
! branch, so no force-quirks call is made here.
1917 unexpected-character-after-doctype-system-identifier
1919 (bogus-doctype-state)
! Entry word for the "after DOCTYPE system identifier" state: runs take-char
! and passes the result to (after-doctype-system-identifier-state).
! NOTE(review): take-char is defined outside this view; presumably it yields
! the next character, or f at end of input -- confirm.
1923 : after-doctype-system-identifier-state ( document n/f string -- document n'/f string )
1924 take-char (after-doctype-system-identifier-state) ;
! Dispatcher for the "bogus DOCTYPE" tokenizer state, which skips input until
! the DOCTYPE is closed or the input ends.
! Stack on entry: document, index n (or f), input string, and the current
! character ch (f at end of input).
1927 : (bogus-doctype-state) ( document n/f string ch/f -- document n'/f string )
! '>' ends the DOCTYPE: emit it and continue in the data state.
1929 { [ dup CHAR: > = ] [ drop pick emit-doctype data-state ] }
! NUL is reported and otherwise ignored.
1930 { [ dup CHAR: \0 = ] [ drop unexpected-null-character bogus-doctype-state ] }
! End of input: per the HTML spec this state emits the pending DOCTYPE token
! and then the EOF token, without raising a parse error of its own.
1931 { [ dup f = ] [ drop pick [ emit-doctype ] [ emit-eof ] bi ] }
! Every other character is ignored.
1932 [ drop bogus-doctype-state ]
! Entry word for the "bogus DOCTYPE" state: runs take-char and passes the
! result to (bogus-doctype-state).
! NOTE(review): take-char is defined outside this view; presumably it yields
! the next character, or f at end of input -- confirm.
1935 : bogus-doctype-state ( document n/f string -- document n'/f string )
1936 take-char (bogus-doctype-state) ;
! Dispatcher for the "CDATA section" tokenizer state.
! Stack on entry: document, index n (or f), input string, and the current
! character ch (f at end of input).
1939 : (cdata-section-state) ( document n/f string ch/f -- document n'/f string )
! A ']' may start the closing "]]>" sequence; it is held back, not emitted.
1941 { [ dup CHAR: ] = ] [ drop cdata-section-bracket-state ] }
! End of input inside CDATA: report it and apply emit-eof to the document.
1942 { [ dup f = ] [ drop eof-in-cdata pick emit-eof ] }
! Any other character is passed with the document to emit-char; then loop.
1943 [ reach emit-char cdata-section-state ]
! Entry word for the "CDATA section" state: runs take-char and passes the
! result to (cdata-section-state).
! NOTE(review): take-char is defined outside this view; presumably it yields
! the next character, or f at end of input -- confirm.
1946 : cdata-section-state ( document n/f string -- document n'/f string )
1947 take-char (cdata-section-state) ;
! Dispatcher for the "CDATA section bracket" state, entered after one ']'.
! Stack on entry: document, index n (or f), input string, and the current
! character ch (f at end of input).
1950 : (cdata-section-bracket-state) ( document n/f string ch/f -- document n'/f string )
! A second ']' moves on to the end state.
1952 { [ dup CHAR: ] = ] [ drop cdata-section-end-state ] }
! Otherwise the held-back ']' is emitted (with ch set aside by dip) and ch is
! reprocessed by the CDATA section dispatcher.
1953 [ [ CHAR: ] reach emit-char ] dip (cdata-section-state) ]
! Entry word for the "CDATA section bracket" state: runs take-char and passes
! the result to (cdata-section-bracket-state).
! NOTE(review): take-char is defined outside this view; presumably it yields
! the next character, or f at end of input -- confirm.
1956 : cdata-section-bracket-state ( document n/f string -- document n'/f string )
1957 take-char (cdata-section-bracket-state) ;
! Dispatcher for the "CDATA section end" state, entered after "]]".
! Stack on entry: document, index n (or f), input string, and the current
! character ch (f at end of input).
1960 : (cdata-section-end-state) ( document n/f string ch/f -- document n'/f string )
! A further ']' is emitted via emit-char and the state is re-entered.
1962 { [ dup CHAR: ] = ] [ reach emit-char cdata-section-end-state ] }
! '>' completes "]]>": continue in the data state.
1963 { [ dup CHAR: > = ] [ drop data-state ] }
! Otherwise the held-back "]]" is emitted (with ch set aside by dip) and ch
! is reprocessed by the CDATA section dispatcher.
1964 [ [ "]]" reach emit-string ] dip (cdata-section-state) ]
! Entry word for the "CDATA section end" state: runs take-char and passes the
! result to (cdata-section-end-state).
! NOTE(review): take-char is defined outside this view; presumably it yields
! the next character, or f at end of input -- confirm.
1967 : cdata-section-end-state ( document n/f string -- document n'/f string )
1968 take-char (cdata-section-end-state) ;
! Dispatcher for the "character reference" state, entered after an '&'.
! Stack on entry: document, index n (or f), input string, and the current
! character ch (f at end of input).
1971 : (character-reference-state) ( document n/f string ch/f -- document n'/f string )
! With ch set aside by dip, '&' and the document go to ch>new-temporary-buffer.
! NOTE(review): presumably this resets the temporary buffer to just "&" -- confirm.
1972 [ CHAR: & reach ch>new-temporary-buffer ] dip
! An alphanumeric ch is handed, unconsumed, to the named-reference dispatcher.
1974 { [ dup ascii-alphanumeric? ] [ (named-character-reference-state) ] }
! '#' is pushed onto the temporary buffer, then the numeric state takes over.
1975 { [ dup CHAR: # = ] [ reach push-temporary-buffer numeric-character-reference-state ] }
! Anything else: flush-temporary-buffer is applied to the document and ch is
! handed to (return-state).
1976 [ reach flush-temporary-buffer (return-state) ]
! Entry word for the "character reference" state: runs take-char and passes
! the result to (character-reference-state).
! NOTE(review): take-char is defined outside this view; presumably it yields
! the next character, or f at end of input -- confirm.
1979 : character-reference-state ( document n/f string -- document n'/f string )
1980 take-char (character-reference-state) ;
! Dispatcher for the "named character reference" state.
! Stack on entry: document, index n (or f), input string, and the current
! character ch (f at end of input).
1983 : (named-character-reference-state) ( document n/f string ch/f -- document n'/f string )
! ch and the document are passed to push-temporary-buffer.
1984 reach push-temporary-buffer
! The document is then checked against the named-character table.
! NOTE(review): named-character-match? is defined elsewhere; the inline
! comment below suggests it reports both an exact and a prefix match -- confirm.
1985 pick named-character-match?
1987 drop ! exact match, drop prefix match
! Three tests on the current match: the reference is being read inside an
! attribute, the buffered name does not end in ';', and the upcoming
! character is '=' or alphanumeric.
! NOTE(review): presumably these are combined so that such a match is flushed
! as plain text rather than decoded -- confirm against the enclosing combinator.
1990 [ pick temporary-buffer-attribute? ]
1991 [ pick temporary-buffer>> ?last CHAR: ; = not ]
1992 [ 3dup peek-from { [ CHAR: = = ] [ ascii-alphanumeric? ] } 1|| ]
1995 flush-temporary-buffer
! take-named-character and then flush-temporary-buffer are applied to the
! document before continuing in the return state.
1998 pick [ take-named-character ] [ flush-temporary-buffer ] bi return-state
! Two alternatives selected by `if`: keep reading the name, or flush the
! buffer on the document and continue in the ambiguous ampersand state.
2002 [ named-character-reference-state ]
2003 [ pick flush-temporary-buffer ambiguous-ampersand-state ] if
! Entry word for the "named character reference" state: runs take-char and
! passes the result to (named-character-reference-state).
! NOTE(review): take-char is defined outside this view; presumably it yields
! the next character, or f at end of input -- confirm.
2006 : named-character-reference-state ( document n/f string -- document n'/f string )
2007 take-char (named-character-reference-state) ;
! Dispatcher for the "ambiguous ampersand" state.
! Stack on entry: document, index n (or f), input string, and the current
! character ch (f at end of input).
2010 : (ambiguous-ampersand-state) ( document n/f string ch/f -- document n'/f string )
! Branch taken for alphanumeric characters.
2012 { [ dup ascii-alphanumeric? ] [
! ';' reports an unknown named character reference; ch is then handed,
! unconsumed, to (return-state).
2015 { [ dup CHAR: ; = ] [ unknown-named-character-reference (return-state) ] }
! Entry word for the "ambiguous ampersand" state: runs take-char and passes
! the result to (ambiguous-ampersand-state).
! NOTE(review): take-char is defined outside this view; presumably it yields
! the next character, or f at end of input -- confirm.
2019 : ambiguous-ampersand-state ( document n/f string -- document n'/f string )
2020 take-char (ambiguous-ampersand-state) ;
! Dispatcher for the "numeric character reference" state, entered after "&#".
! Stack on entry: document, index n (or f), input string, and the current
! character ch (f at end of input).
2023 : (numeric-character-reference-state) ( document n/f string ch/f -- document n'/f string )
! 'x' or 'X' is pushed onto the temporary buffer and introduces hex digits.
2025 { [ dup "xX" member? ] [ reach push-temporary-buffer hexadecimal-character-reference-start-state ] }
! Anything else is handed, unconsumed, to the decimal start dispatcher.
2026 [ (decimal-character-reference-start-state) ]
! Entry word for the "numeric character reference" state: runs take-char and
! passes the result to (numeric-character-reference-state).
! NOTE(review): take-char is defined outside this view; presumably it yields
! the next character, or f at end of input -- confirm.
2029 : numeric-character-reference-state ( document n/f string -- document n'/f string )
2030 take-char (numeric-character-reference-state) ;
! Dispatcher for the "hexadecimal character reference start" state.
! Stack on entry: document, index n (or f), input string, and the current
! character ch (f at end of input).
2033 : (hexadecimal-character-reference-start-state) ( document n/f string ch/f -- document n'/f string )
! A hex digit is handed, unconsumed, to the hexadecimal digit dispatcher.
2035 { [ dup ascii-hex-digit? ] [ (hexadecimal-character-reference-state) ] }
! No digit: report it, apply flush-temporary-buffer to the document and hand
! ch to (return-state).
2036 [ absence-of-digits-in-numeric-character-reference reach flush-temporary-buffer (return-state) ]
! Entry word for the "hexadecimal character reference start" state: runs
! take-char and passes the result to the (...) dispatcher of the same name.
! NOTE(review): take-char is defined outside this view; presumably it yields
! the next character, or f at end of input -- confirm.
2039 : hexadecimal-character-reference-start-state ( document n/f string -- document n'/f string )
2040 take-char (hexadecimal-character-reference-start-state) ;
! Dispatcher for the "decimal character reference start" state.
! Stack on entry: document, index n (or f), input string, and the current
! character ch (f at end of input).
2043 : (decimal-character-reference-start-state) ( document n/f string ch/f -- document n'/f string )
! A decimal digit is handed, unconsumed, to the decimal digit dispatcher.
2045 { [ dup ascii-digit? ] [ (decimal-character-reference-state) ] }
! No digit: report it, apply flush-temporary-buffer to the document and hand
! ch to (return-state).
2046 [ absence-of-digits-in-numeric-character-reference reach flush-temporary-buffer (return-state) ]
! Entry word for the "decimal character reference start" state: runs
! take-char and passes the result to the (...) dispatcher of the same name.
! NOTE(review): take-char is defined outside this view; presumably it yields
! the next character, or f at end of input -- confirm.
2049 : decimal-character-reference-start-state ( document n/f string -- document n'/f string )
2050 take-char (decimal-character-reference-start-state) ;
! Dispatcher for the "hexadecimal character reference" state.
! Stack on entry: document, index n (or f), input string, and the current
! character ch (f at end of input).
2053 : (hexadecimal-character-reference-state) ( document n/f string ch/f -- document n'/f string )
! The three digit classes are not implemented yet; each calls unimplemented*.
2055 { [ dup ascii-digit? ] [ unimplemented* ] }
2056 { [ dup ascii-upper-hex-digit? ] [ unimplemented* ] }
2057 { [ dup ascii-lower-hex-digit? ] [ unimplemented* ] }
! ';' terminates the reference.
2058 { [ dup CHAR: ; = ] [ drop numeric-character-reference-end-state ] }
! Anything else: report the missing ';' and, as the HTML spec requires,
! reconsume ch in the numeric character reference end state. Without the
! hand-off ch stays on the stack and the branch disagrees with the declared
! stack effect.
! NOTE(review): relies on (numeric-character-reference-end-state) being
! DEFERred earlier in the file, like the other (...) state words -- confirm.
2059 [ missing-semicolon-after-character-reference (numeric-character-reference-end-state) ]
! Entry word for the "hexadecimal character reference" state: runs take-char
! and passes the result to (hexadecimal-character-reference-state).
! NOTE(review): take-char is defined outside this view; presumably it yields
! the next character, or f at end of input -- confirm.
2062 : hexadecimal-character-reference-state ( document n/f string -- document n'/f string )
2063 take-char (hexadecimal-character-reference-state) ;
! Dispatcher for the "decimal character reference" state.
! Stack on entry: document, index n (or f), input string, and the current
! character ch (f at end of input).
2066 : (decimal-character-reference-state) ( document n/f string ch/f -- document n'/f string )
! Digit accumulation is not implemented yet; this branch calls unimplemented*.
2068 { [ dup ascii-digit? ] [ unimplemented* ] }
! ';' terminates the reference.
2069 { [ dup CHAR: ; = ] [ drop numeric-character-reference-end-state ] }
! Anything else: report the missing ';' and, as the HTML spec requires,
! reconsume ch in the numeric character reference end state. Without the
! hand-off ch stays on the stack and the branch disagrees with the declared
! stack effect.
! NOTE(review): relies on (numeric-character-reference-end-state) being
! DEFERred earlier in the file, like the other (...) state words -- confirm.
2070 [ missing-semicolon-after-character-reference (numeric-character-reference-end-state) ]
! Entry word for the "decimal character reference" state: runs take-char and
! passes the result to (decimal-character-reference-state).
! NOTE(review): take-char is defined outside this view; presumably it yields
! the next character, or f at end of input -- confirm.
2073 : decimal-character-reference-state ( document n/f string -- document n'/f string )
2074 take-char (decimal-character-reference-state) ;
! Dispatcher for the "numeric character reference end" state.
! Stack on entry: document, index n (or f), input string, and the current
! character ch (f at end of input).
! NOTE(review): this looks like a placeholder. The only visible branch reports
! a missing semicolon; it does not visibly consume ch or validate and emit
! the referenced code point as the HTML spec describes -- confirm and finish.
2077 : (numeric-character-reference-end-state) ( document n/f string ch/f -- document n'/f string )
2079 [ missing-semicolon-after-character-reference ]
! Entry word for the "numeric character reference end" state: runs take-char
! and passes the result to (numeric-character-reference-end-state).
! NOTE(review): the HTML spec does not consume a character in this state, but
! take-char (defined outside this view) presumably does -- confirm.
2082 : numeric-character-reference-end-state ( document n/f string -- document n'/f string )
2083 take-char (numeric-character-reference-end-state) ;
! Public entry point: tokenizes an HTML string.
! Places a new <document> and the start index 0 beneath the input string,
! runs the tokenizer from data-state, then drops the final index and the
! string so that only the document is returned.
2087 : parse-html5 ( string -- document )
2088 [ <document> 0 ] dip data-state 2drop ;