]> gitweb.factorcode.org Git - factor.git/blob - extra/html5/html5.factor
435f4c544ad81fd43e46d53533be6e05e8482354
[factor.git] / extra / html5 / html5.factor
1 ! Copyright (C) 2020 Doug Coleman.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors arrays assocs combinators
4 combinators.short-circuit io io.encodings.utf8 io.files
5 json.reader kernel math math.order memoize modern.slices
6 prettyprint sequences sequences.extras strings suffix-arrays
7 words ;
8 IN: html5
9
10 : 1sbuf ( ch -- sbuf ) [ SBUF" " clone ] dip over push ; inline
11 : ?1sbuf ( ch -- sbuf ) [ SBUF" " clone ] dip [ over push ] when* ; inline
12
13 ! https://html.spec.whatwg.org/multipage/parsing.html#tokenization
14
15 ! https://infra.spec.whatwg.org/#namespaces
16 CONSTANT: html-namespace "http://www.w3.org/1999/xhtml"
17 CONSTANT: mathml-namespace "http://www.w3.org/1998/Math/MathML"
18 CONSTANT: svg-namespace "http://www.w3.org/2000/svg"
19 CONSTANT: xlink-namespace "http://www.w3.org/1999/xlink"
20 CONSTANT: xml-namespace "http://www.w3.org/XML/1998/namespace"
21 CONSTANT: xmlns-namespace "http://www.w3.org/2000/xmlns/"
22
23 DEFER: data-state
24 DEFER: (data-state)
25 DEFER: rcdata-state
26 DEFER: (rcdata-state)
27 DEFER: rawtext-state
28 DEFER: (rawtext-state)
29 DEFER: script-data-state
30 DEFER: (script-data-state)
31 DEFER: plaintext-state
32 DEFER: (plaintext-state)
33 DEFER: tag-open-state
34 DEFER: (tag-open-state)
35 DEFER: end-tag-open-state
36 DEFER: (end-tag-open-state)
37 DEFER: tag-name-state
38 DEFER: (tag-name-state)
39 DEFER: rcdata-less-than-sign-state
40 DEFER: (rcdata-less-than-sign-state)
41 DEFER: rcdata-end-tag-open-state
42 DEFER: (rcdata-end-tag-open-state)
43 DEFER: rcdata-end-tag-name-state
44 DEFER: (rcdata-end-tag-name-state)
45 DEFER: rawtext-less-than-sign-state
46 DEFER: (rawtext-less-than-sign-state)
47 DEFER: rawtext-end-tag-open-state
48 DEFER: (rawtext-end-tag-open-state)
49 DEFER: rawtext-end-tag-name-state
50 DEFER: (rawtext-end-tag-name-state)
51 DEFER: script-data-less-than-sign-state
52 DEFER: (script-data-less-than-sign-state)
53 DEFER: script-data-end-tag-open-state
54 DEFER: (script-data-end-tag-open-state)
55 DEFER: script-data-end-tag-name-state
56 DEFER: (script-data-end-tag-name-state)
57 DEFER: script-data-escape-start-state
58 DEFER: (script-data-escape-start-state)
59 DEFER: script-data-escape-start-dash-state
60 DEFER: (script-data-escape-start-dash-state)
61 DEFER: script-data-escaped-state
62 DEFER: (script-data-escaped-state)
63 DEFER: script-data-escaped-dash-state
64 DEFER: (script-data-escaped-dash-state)
65 DEFER: script-data-escaped-dash-dash-state
66 DEFER: (script-data-escaped-dash-dash-state)
67 DEFER: script-data-escaped-less-than-sign-state
68 DEFER: (script-data-escaped-less-than-sign-state)
69 DEFER: script-data-escaped-end-tag-open-state
70 DEFER: (script-data-escaped-end-tag-open-state)
71 DEFER: script-data-escaped-end-tag-name-state
72 DEFER: (script-data-escaped-end-tag-name-state)
73 DEFER: script-data-double-escape-start-state
74 DEFER: (script-data-double-escape-start-state)
75 DEFER: script-data-double-escaped-state
76 DEFER: (script-data-double-escaped-state)
77 DEFER: script-data-double-escaped-dash-state
78 DEFER: (script-data-double-escaped-dash-state)
79 DEFER: script-data-double-escaped-dash-dash-state
80 DEFER: (script-data-double-escaped-dash-dash-state)
81 DEFER: script-data-double-escaped-less-than-sign-state
82 DEFER: (script-data-double-escaped-less-than-sign-state)
83 DEFER: script-data-double-escape-end-state
84 DEFER: (script-data-double-escape-end-state)
85 DEFER: before-attribute-name-state
86 DEFER: (before-attribute-name-state)
87 DEFER: attribute-name-state
88 DEFER: (attribute-name-state)
89 DEFER: after-attribute-name-state
90 DEFER: (after-attribute-name-state)
91 DEFER: before-attribute-value-state
92 DEFER: (before-attribute-value-state)
93 DEFER: attribute-value-double-quoted-state
94 DEFER: (attribute-value-double-quoted-state)
95 DEFER: attribute-value-single-quoted-state
96 DEFER: (attribute-value-single-quoted-state)
97 DEFER: attribute-value-unquoted-state
98 DEFER: (attribute-value-unquoted-state)
99 DEFER: after-attribute-value-quoted-state
100 DEFER: (after-attribute-value-quoted-state)
101 DEFER: self-closing-start-tag-state
102 DEFER: (self-closing-start-tag-state)
103 DEFER: bogus-comment-state
104 DEFER: (bogus-comment-state)
105 DEFER: markup-declaration-open-state
106 DEFER: (markup-declaration-open-state)
107 DEFER: comment-start-state
108 DEFER: (comment-start-state)
109 DEFER: comment-start-dash-state
110 DEFER: (comment-start-dash-state)
111 DEFER: comment-state
112 DEFER: (comment-state)
113 DEFER: comment-less-than-sign-state
114 DEFER: (comment-less-than-sign-state)
115 DEFER: comment-less-than-sign-bang-state
116 DEFER: (comment-less-than-sign-bang-state)
117 DEFER: comment-less-than-sign-bang-dash-state
118 DEFER: (comment-less-than-sign-bang-dash-state)
119 DEFER: comment-less-than-sign-bang-dash-dash-state
120 DEFER: (comment-less-than-sign-bang-dash-dash-state)
121 DEFER: comment-end-dash-state
122 DEFER: (comment-end-dash-state)
123 DEFER: comment-end-state
124 DEFER: (comment-end-state)
125 DEFER: comment-end-bang-state
126 DEFER: (comment-end-bang-state)
127 DEFER: doctype-state
128 DEFER: (doctype-state)
129 DEFER: before-doctype-name-state
130 DEFER: (before-doctype-name-state)
131 DEFER: doctype-name-state
132 DEFER: (doctype-name-state)
133 DEFER: after-doctype-name-state
134 DEFER: (after-doctype-name-state)
135 DEFER: after-doctype-public-keyword-state
136 DEFER: (after-doctype-public-keyword-state)
137 DEFER: before-doctype-public-identifier-state
138 DEFER: (before-doctype-public-identifier-state)
139 DEFER: doctype-public-identifier-double-quoted-state
140 DEFER: (doctype-public-identifier-double-quoted-state)
141 DEFER: doctype-public-identifier-single-quoted-state
142 DEFER: (doctype-public-identifier-single-quoted-state)
143 DEFER: after-doctype-public-identifier-state
144 DEFER: (after-doctype-public-identifier-state)
145 DEFER: between-doctype-public-and-system-identifiers-state
146 DEFER: (between-doctype-public-and-system-identifiers-state)
147 DEFER: after-doctype-system-keyword-state
148 DEFER: (after-doctype-system-keyword-state)
149 DEFER: before-doctype-system-identifier-state
150 DEFER: (before-doctype-system-identifier-state)
151 DEFER: doctype-system-identifier-double-quoted-state
152 DEFER: (doctype-system-identifier-double-quoted-state)
153 DEFER: doctype-system-identifier-single-quoted-state
154 DEFER: (doctype-system-identifier-single-quoted-state)
155 DEFER: after-doctype-system-identifier-state
156 DEFER: (after-doctype-system-identifier-state)
157 DEFER: bogus-doctype-state
158 DEFER: (bogus-doctype-state)
159 DEFER: cdata-section-state
160 DEFER: (cdata-section-state)
161 DEFER: cdata-section-bracket-state
162 DEFER: (cdata-section-bracket-state)
163 DEFER: cdata-section-end-state
164 DEFER: (cdata-section-end-state)
165 DEFER: character-reference-state
166 DEFER: (character-reference-state)
167 DEFER: named-character-reference-state
168 DEFER: (named-character-reference-state)
169 DEFER: ambiguous-ampersand-state
170 DEFER: (ambiguous-ampersand-state)
171 DEFER: numeric-character-reference-state
172 DEFER: (numeric-character-reference-state)
173 DEFER: hexadecimal-character-reference-start-state
174 DEFER: (hexadecimal-character-reference-start-state)
175 DEFER: decimal-character-reference-start-state
176 DEFER: (decimal-character-reference-start-state)
177 DEFER: hexadecimal-character-reference-state
178 DEFER: (hexadecimal-character-reference-state)
179 DEFER: decimal-character-reference-state
180 DEFER: (decimal-character-reference-state)
181 DEFER: numeric-character-reference-end-state
182 DEFER: (numeric-character-reference-end-state)
183
184
185 ERROR: unimplemented string ;
186 ERROR: unimplemented* ;
187
188 ! Errors: https://html.spec.whatwg.org/multipage/parsing.html#parse-errors
189 ERROR: abrupt-closing-of-empty-comment ;
190 ERROR: abrupt-doctype-public-identifier ;
191 ERROR: abrupt-doctype-system-identifier ;
192 ERROR: absence-of-digits-in-numeric-character-reference ;
193 ERROR: cdata-in-html-content ;
194 ERROR: character-reference-outside-unicode-range ;
195 ERROR: control-character-in-input-stream ;
196 ERROR: control-character-reference ;
197 ERROR: end-tag-with-attributes ;
198 ERROR: duplicate-attribute ;
199 ERROR: end-tag-with-trailing-solidus ;
200 ERROR: eof-before-tag-name ;
201 ERROR: eof-in-cdata ;
202 ERROR: eof-in-comment ;
203 ERROR: eof-in-doctype ;
204 ERROR: eof-in-script-html-comment-like-text ;
205 ERROR: eof-in-tag ;
206 ERROR: incorrectly-closed-comment ;
207 ERROR: incorrectly-opened-comment ;
208 ERROR: invalid-character-sequence-after-doctype-name ;
209 ERROR: invalid-first-character-of-tag-name ;
210 ERROR: missing-attribute-value ;
211 ERROR: missing-doctype-name ;
212 ERROR: missing-doctype-public-identifier ;
213 ERROR: missing-doctype-system-identifier ;
214 ERROR: missing-end-tag-name ;
215 ERROR: missing-quote-before-doctype-public-identifier ;
216
217 ERROR: missing-quote-before-doctype-system-identifier ;
218 ERROR: missing-semicolon-after-character-reference ;
219 ERROR: missing-whitespace-after-doctype-public-keyword ;
220 ERROR: missing-whitespace-after-doctype-system-keyword ;
221 ERROR: missing-whitespace-before-doctype-name ;
222 ERROR: missing-whitespace-between-attributes ;
223 ERROR: missing-whitespace-between-doctype-public-and-system-identifiers ;
224 ERROR: nested-comment ;
225 ERROR: noncharacter-character-reference ;
226 ERROR: noncharacter-in-input-stream ;
227 ERROR: non-void-html-element-start-tag-with-trailing-solidus ;
228 ERROR: null-character-reference ;
229 ERROR: surrogate-character-reference ;
230 ERROR: surrogate-in-input-stream ;
231 ERROR: unexpected-character-after-doctype-system-identifier ;
232 ERROR: unexpected-character-in-attribute-name ;
233 ERROR: unexpected-character-in-unquoted-attribute-value ;
234 ERROR: unexpected-equals-sign-before-attribute-name ;
235 ERROR: unexpected-null-character ;
236 ERROR: unexpected-question-mark-instead-of-tag-name ;
237 ERROR: unexpected-solidus-in-tag ;
238 ERROR: unknown-named-character-reference ;
239
240 ! Tree insertion modes
241 SINGLETONS: initial-mode before-html-mode before-head-mode
242 in-head-mode in-head-noscript-mode after-head-mode
243 in-body-mode text-mode in-table-mode in-table-text-mode
244 in-caption-mode in-column-group-mode in-table-body-mode
245 in-row-mode in-cell-mode in-select-mode in-select-in-table-mode in-template-mode
246 after-body-mode in-frameset-mode after-frameset-mode after-after-body-mode
247 after-after-frameset-mode ;
248
249 TUPLE: document
250 quirks-mode?
251 limited-quirks-mode?
252 iframe-srcdoc?
253 scripting? ! set in constructor
254 frameset-ok? ! frameset-ok? but we want default to f
255 fostering-parent?
256 tree
257 tree-doctype
258 head-element-pointer ! set during insertion time
259 parser-cannot-change-mode-flag
260 insertion-mode
261 original-insertion-mode
262 last
263 node
264 context
265 doctype
266 tag
267 end-tag
268
269 tag-name
270 end-tag-name
271 attribute-name
272 attribute-value
273 temporary-buffer
274 comment-token
275 open-elements
276 return-state ;
277
278 ! "reset the insertion mode appropriately"
279 ! : reset-insertion-mode ( document -- document )
280 !     f >>last
281 !     dup open-elements>> ?last >>node
282 !     dup [ open-elements>> ?first ] [ node>> ] bi = [
283 !         t >>last dup node>> >>context
284 !     ] when
285 !     dup node>> {
286 !         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
287 !         { [
288 !             dup name>> >lower { "td" "th" } member?
289 !             pick last>> f = and
290 !         ] [ drop in-select >>insertion-mode ] }
291 !         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
292 !         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
293 !         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
294 !         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
295 !         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
296 !         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
297 !         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
298 !         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
299 !         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
300 !     } cond
301 !     ;
302
303 : temporary-buffer-attribute? ( document -- ? )
304     return-state>>
305     {
306         attribute-value-unquoted-state
307         attribute-value-single-quoted-state
308         attribute-value-double-quoted-state
309     } member? ;
310
311 ! name, public/system identifier should not be empty strings
312 ! until the state machine demands it
313 TUPLE: doctype
314     name
315     public-identifier
316     system-identifier
317     quirks? ;
318
319 : <doctype> ( -- doctype )
320     doctype new ; inline
321
322 : new-doctype-from-ch ( ch document -- )
323     [
324         doctype new
325             swap ?1sbuf >>name
326     ] dip doctype<< ; inline
327
328 : new-doctype-with-quirks ( document -- )
329     <doctype> t >>quirks? >>doctype drop ;
330
331 TUPLE: tag self-closing? name attributes children end-tag ;
332
333 : <tag> ( -- tag )
334     tag new
335         SBUF" " clone >>name
336         V{ } clone >>attributes
337         V{ } clone >>children ;
338
339 TUPLE: end-tag self-closing? name attributes ;
340
341 : <end-tag> ( -- tag )
342     end-tag new
343         SBUF" " clone >>name
344         V{ } clone >>attributes ;
345
346 : new-tag ( document -- )
347     <tag> >>tag drop ;
348
349 : new-end-tag ( document -- )
350     <end-tag> >>tag drop ;
351
352 : set-self-closing ( document -- )
353     tag>> t >>self-closing? drop ;
354
355 : <document> ( -- document )
356     document new
357         V{ } clone >>tree
358         initial-mode >>insertion-mode
359         <doctype> >>doctype
360         t >>frameset-ok?
361         ! SBUF" " clone >>tag-name
362         SBUF" " clone >>attribute-name
363         SBUF" " clone >>attribute-value
364         SBUF" " clone >>temporary-buffer
365         SBUF" " clone >>comment-token
366         V{ } clone >>open-elements
367     ; inline
368
369 TUPLE: comment open payload close ;
370
371 : <comment> ( payload -- comment )
372     comment new
373         swap >>payload ; inline
374
375 : force-quirks ( document -- )
376     doctype>> t >>quirks? drop ;
377
378 : initialize-doctype-name ( document -- )
379     [ SBUF" " clone ] dip doctype>> name<< ;
380
381 : initialize-doctype-public-identifier ( document -- )
382     [ SBUF" " clone ] dip doctype>> public-identifier<< ;
383
384 : initialize-doctype-system-identifier ( document -- )
385     [ SBUF" " clone ] dip doctype>> system-identifier<< ;
386
387 : push-doctype-name ( ch document -- )
388     doctype>> name>> push ;
389
390 : push-doctype-public-identifier ( ch document -- )
391     doctype>> public-identifier>> push ;
392
393 : push-doctype-system-identifier ( ch document -- )
394     doctype>> system-identifier>> push ;
395
396 ! XXX: not html5 spec, fix
397 ERROR: unmatched-closing-tag-error stack tag ;
398
399 : unclosed-tag? ( obj -- ? )
400     { [ tag? ] [ end-tag>> not ] } 1&& ; inline
401
402 :: find-matching-tag ( name stack -- seq )
403     stack [ { [ unclosed-tag? ] [ name>> name = ] } 1&& ] find-last drop [
404         stack swap shorten*
405     ] [
406         stack name unmatched-closing-tag-error
407     ] if* ;
408
409 DEFER: tree-insert
410 GENERIC: tree-insert* ( document obj insertion-mode -- document )
411
412 : limited-quirks-mode? ( doctype -- ? )
413     {
414         [ public-identifier>> "-//W3C//DTD XHTML 1.0 Frameset//" head? ]
415         [ public-identifier>> "-//W3C//DTD XHTML 1.0 Transitional//" head? ]
416         [ { [ system-identifier>> ] [ public-identifier>> "-//W3C//DTD HTML 4.01 Frameset//" head? ] } 1&& ]
417         [ { [ system-identifier>> ] [ public-identifier>> "-//W3C//DTD HTML 4.01 Transitional//" head? ] } 1&& ]
418     } 1|| ;
419
420 ! https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode
421 M: initial-mode tree-insert*
422     drop {
423         { [ dup "\t\n\f\r\s" member? ] [ drop ] }
424         { [ dup doctype? ] [
425             >>tree-doctype before-html-mode >>insertion-mode
426         ] }
427         { [ dup comment? ] [ over tree>> push ] }
428         { [ dup tag? ] [ over tree>> push ] }
429         { [ dup end-tag? ] [
430             dup name>> pick tree>> find-matching-tag
431             unclip
432                 swap >>children
433                 swap >>end-tag
434             over tree>> push
435         ] }
436         [
437             over iframe-srcdoc?>> [
438                 over parser-cannot-change-mode-flag>> [
439                     [ t >>quirks-mode? ] dip
440                 ] unless
441             ] [
442                 "must be iframe-srcdoc here" throw
443             ] if
444             ! reprocess the token
445             before-html-mode >>insertion-mode tree-insert
446         ]
447     } cond ;
448
449 ! https://html.spec.whatwg.org/multipage/parsing.html#the-before-html-insertion-mode
450 M: before-html-mode tree-insert*
451     drop {
452         { [ dup doctype? ] [ drop ] }
453         { [ dup comment? ] [ over tree>> push ] }
454         { [ dup "\t\n\f\r\s" member? ] [ drop ] }
455         { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [
456             over tree>> push
457             before-head-mode >>insertion-mode
458         ] }
459         ! these tags are handled in the default case
460         { [ dup { [ end-tag? ] [ name>> { "head" "body" "html" "br" } member? not ] } 1&& ] [
461             ! error end-tag, ignore
462             drop
463         ] }
464         [
465             ! Create missing html tag and reprocess the token
466             <tag> "html" >>name pick tree>> push
467             before-head-mode >>insertion-mode tree-insert
468         ]
469     } cond ;
470
471 M: before-head-mode tree-insert*
472     drop {
473         { [ dup "\t\n\f\r\s" member? ] [ drop ] }
474         { [ dup comment? ] [ over tree>> push ] }
475         { [ dup doctype? ] [ drop ] }
476         { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [
477             ! XXX: in-body-mode rules here for html tag
478             ! B
479             ! over tree>> push
480             ! before-head-mode >>insertion-mode
481             "handle html in-body-mode here" throw
482         ] }
483         { [ dup { [ tag? ] [ name>> "head" = ] } 1&& ] [
484             [ swap tree>> push ]
485             [ >>head-element-pointer drop ]
486             [ drop in-head-mode >>insertion-mode ] 2tri
487         ] }
488         ! these tags are handled in the default case
489         { [ dup { [ end-tag? ] [ name>> { "head" "body" "html" "br" } member? not ] } 1&& ] [
490             ! error end-tag, ignore
491             drop
492         ] }
493         ! ignore tag
494         { [ dup tag? ] [ drop ] }
495         [
496             ! Create missing html tag and reprocess the token
497             <tag>
498             [ "head" >>name pick tree>> push ]
499             [ >>head-element-pointer ] bi
500             in-head-mode >>insertion-mode tree-insert
501         ]
502     } cond ;
503
504 M: in-head-mode tree-insert*
505     drop {
506         { [ dup "\t\n\f\r\s" member? ] [ over tree>> push ] }
507         { [ dup comment? ] [ over tree>> push ] }
508         { [ dup doctype? ] [ drop ] }
509         { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [
510             ! XXX: in-body-mode rules here for html tag
511             ! B
512             ! over tree>> push
513             ! before-head-mode >>insertion-mode
514             "handle html in-body-mode here" throw
515             unimplemented*
516         ] }
517         { [ dup { [ tag? ] [ name>> { "base" "basefont" "bgsound" "link" } member? ] } 1&& ] [
518             ! non-void-html-element-start-tag-with-trailing-solidus soft error if not self-closing
519             unimplemented*
520         ] }
521         { [ dup { [ tag? ] [ name>> "meta" = ] } 1&& ] [
522             unimplemented*
523         ] }
524         { [ dup { [ tag? ] [ name>> "title" = ] } 1&& ] [
525             ! https://html.spec.whatwg.org/multipage/parsing.html#generic-rcdata-element-parsing-algorithm
526             "insert title node" throw
527             unimplemented*
528         ] }
529         { [
530             dup {
531                 [ { [ tag? ] [ name>> "noscript" = ] [ scripting?>> ] } 1&& ]
532                 [ { [ tag? ] [ name>> { "noframes" "style" } member? ] } 1&& ]
533             } 1||
534         ] [
535             ! https://html.spec.whatwg.org/multipage/parsing.html#generic-raw-text-element-parsing-algorithm
536             unimplemented*
537         ] }
538         { [ dup { [ tag? ] [ name>> "noscript" = ] [ scripting?>> not ] } 1&& ] [
539             unimplemented*
540             over tree>> push
541             in-head-noscript-mode >>insertion-mode
542         ] }
543         { [ dup { [ tag? ] [ name>> "script" = ] } 1&& ] [
544             unimplemented*
545             text-mode >>insertion-mode
546         ] }
547         { [ dup { [ end-tag? ] [ name>> "head" = ] } 1&& ] [
548             over tree>> last end-tag<<
549             after-head-mode >>insertion-mode
550         ] }
551         { [ dup { [ end-tag? ] [ name>> { "body" "html" "br" } member? ] } 1&& ] [
552             ! non-void-html-element-start-tag-with-trailing-solidus soft error if not self-closing
553             unimplemented*
554         ] }
555         { [ dup { [ tag? ] [ name>> "template" = ] } 1&& ] [
556             unimplemented*
557             in-template-mode >>insertion-mode
558         ] }
559         { [ dup { [ end-tag? ] [ name>> "template" = ] } 1&& ] [
560             unimplemented*
561         ] }
562         ! XXX: revisit this
563         { [ dup {
564             [ { [ tag? ] [ name>> "head" = ] } 1&& ]
565             [ end-tag? ]
566             } 1|| ] [ drop "ignore here" throw ] }
567         [
568             ! end head tag should be here, pop off, reprocess
569             over tree>> pop swap >>end-tag
570             after-head-mode >>insertion-mode "omg" throw
571         ]
572     } cond ;
573
574 M: in-head-noscript-mode tree-insert* drop unimplemented* ;
575
576 M: after-head-mode tree-insert*
577     drop {
578         { [ dup "\t\n\f\r\s" member? ] [ over tree>> push ] }
579         { [ dup comment? ] [ over tree>> push ] }
580         { [ dup doctype? ] [ drop ] }
581         { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [
582             ! XXX: in-body-mode rules here for html tag
583             ! B
584             ! over tree>> push
585             ! before-head-mode >>insertion-mode
586             "handle html in-body-mode here" throw
587             unimplemented*
588         ] }
589         { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [
590             unimplemented*
591         ] }
592         { [ dup { [ tag? ] [ name>> "body" = ] } 1&& ] [
593             over tree>> push
594             f >>frameset-ok?
595             in-body-mode >>insertion-mode
596         ] }
597         { [ dup { [ tag? ] [ name>> "frameset" = ] } 1&& ] [
598             unimplemented*
599         ] }
600         { [ dup { [ tag? ] [
601             name>> {
602                 "base" "basefont" "bgsound" "link" "meta"
603                 "noframes" "script" "style" "template" "title"
604             } member? ] } 1&&
605         ] [
606             unimplemented*
607         ] }
608         { [ dup { [ end-tag? ] [ name>> "template" = ] } 1&& ] [
609             unimplemented*
610         ] }
611         ! same as default case
612         ! { [ dup { [ end-tag? ] [ name>> { "body" "html" "br" } member? not ] } 1&& ] [
613         !     unimplemented*
614         ! ] }
615         { [
616             dup {
617                 [ { [ tag? ] [ name>> "head" = ] } 1&& ]
618                 [ { [ end-tag? ] [ name>> { "body" "html" "br" } member? not ] } 1&& ]
619             } 1||
620         ] [
621             "omg revisit this" throw
622             unimplemented*
623         ] }
624         [
625             B
626             <tag> "body" >>name pick tree>> push
627             in-body-mode >>insertion-mode tree-insert
628         ]
629     } cond ;
630
631 M: in-body-mode tree-insert*
632     drop {
633         { [ dup CHAR: \0 = ] [ drop ] }
634         { [ dup "\t\n\f\r\s" member? ] [ over tree>> push ] }
635         { [ dup comment? ] [ over tree>> push ] }
636         { [ dup doctype? ] [ drop ] }
637         { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [ drop ] }
638         { [ dup {
639             [
640                 {
641                     [ tag? ]
642                     [
643                         name>> {
644                             "base" "basefont" "bgsound" "link" "meta"
645                             "noframes" "script" "style" "template" "title"
646                         } member?
647                     ]
648                 } 1&&
649             ] [
650                 { [ end-tag? ] [ name>> "template" = ] } 1&&
651             ] } 1||
652         ] [
653             unimplemented*
654         ] }
655         ! XXX: parse error
656         { [ dup { [ tag? ] [ name>> "body" = ] } 1&& ] [ drop unimplemented* ] }
657         { [ dup { [ tag? ] [ name>> "frameset" = ] } 1&& ] [ drop unimplemented* ] }
658         ! XXX: eof
659         ! { [ ] [ ] }
660         { [ dup { [ end-tag? ] [ name>> "body" = ] } 1&& ] [
661             "body" pick tree>> find-matching-tag
662             unclip
663                 swap >>children
664                 swap >>end-tag
665             over tree>> push
666
667             after-body-mode >>insertion-mode
668          ] }
669         { [ dup { [ end-tag? ] [ name>> "html" = ] } 1&& ] [ drop unimplemented* ] }
670         ! { [ ] [ ] }
671         [
672             unimplemented*
673         ]
674     } cond ;
675
676 M: text-mode tree-insert* drop unimplemented* ;
677 M: in-table-mode tree-insert* drop unimplemented* ;
678 M: in-table-text-mode tree-insert* drop unimplemented* ;
679 M: in-caption-mode tree-insert* drop unimplemented* ;
680 M: in-column-group-mode tree-insert* drop unimplemented* ;
681 M: in-table-body-mode tree-insert* drop unimplemented* ;
682 M: in-row-mode tree-insert* drop unimplemented* ;
683 M: in-cell-mode tree-insert* drop unimplemented* ;
684 M: in-select-mode tree-insert* drop unimplemented* ;
685 M: in-select-in-table-mode tree-insert* drop unimplemented* ;
686 M: in-template-mode tree-insert* drop unimplemented* ;
687 M: after-body-mode tree-insert*
688     drop {
689         { [ dup "\t\n\f\r\s" member? ] [ over tree>> push ] }
690         { [ dup comment? ] [ over tree>> push ] }
691         { [ dup doctype? ] [ drop ] }
692         { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [
693             unimplemented*
694          ] }
695         { [ dup { [ end-tag? ] [ name>> "html" = ] } 1&& ] [
696             ! XXX: make this a function
697             "html" pick tree>> find-matching-tag
698             unclip
699                 swap >>children
700                 swap >>end-tag
701             over tree>> push
702
703             after-after-body-mode >>insertion-mode
704         ] }
705         [
706             unimplemented*
707         ]
708     } cond ;
709 M: in-frameset-mode tree-insert* drop unimplemented* ;
710 M: after-frameset-mode tree-insert* drop unimplemented* ;
711
712 M: after-after-body-mode tree-insert*
713     drop {
714         { [ dup comment? ] [ over tree>> push ] }
715         { [ dup doctype? ] [ unimplemented*  ] }
716         { [ dup "\t\n\f\r\s" member? ] [ unimplemented*  ] }
717         { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [ unimplemented* ] }
718         ! eof
719         { [ dup f = ] [ drop ] }
720         [
721             ! XXX: parse error
722             [ in-body-mode >>insertion-mode ] dip tree-insert
723         ]
724     } cond ;
725
726 M: after-after-frameset-mode tree-insert* drop unimplemented* ;
727
728 : tree-insert ( document obj -- document )
729     over insertion-mode>> tree-insert* ;
730
731 MEMO: load-entities ( -- assoc )
732     "vocab:html5/entities.json" utf8 file-contents json> ;
733
734 MEMO: entities-suffix-array ( -- assoc )
735     load-entities keys >suffix-array ;
736
737 : lookup-entity ( string -- entity/string ? )
738     load-entities ?at ;
739
740 : named-character-match? ( document -- prefix? exact? )
741     temporary-buffer>>
742     [ entities-suffix-array query f like ]
743     [ last CHAR: ; = ] bi ;
744
745 ERROR: unknown-named-entity entity ;
746 : take-named-character ( document -- )
747     dup
748     temporary-buffer>> >string lookup-entity [
749         "characters" of
750         SBUF" " clone-like >>temporary-buffer drop
751     ] [
752         unknown-named-entity
753     ] if ;
754
755 ! XXX: remove the tag>> name>> push part
756 : push-tag-name ( ch document -- )
757     [ tag>> name>> push ]
758     [
759         2drop ! tag-name>> push
760     ] 2bi ;
761 : push-attribute-name ( ch document -- ) attribute-name>> push ;
762 : push-attribute-value ( ch document -- ) attribute-value>> push ;
763 : push-comment-token ( ch document -- ) comment-token>> push ;
764 : push-all-comment-token ( string document -- ) comment-token>> push-all ;
765
766 ERROR: invalid-return-state obj ;
767 : check-return-state ( obj -- return-state )
768     dup word? [ invalid-return-state ] unless ;
769
770 : current-attribute ( document -- attribute/f )
771     [ attribute-name>> >string f like ]
772     [ attribute-value>> >string f like ] bi
773     2dup or [ 2array ] [ 2drop f ] if ;
774
775 : push-when ( obj/f seq -- )
776     over [ push ] [ 2drop ] if ; inline
777
778 : reset-attribute ( document -- )
779     SBUF" " clone >>attribute-name
780     SBUF" " clone >>attribute-value drop ;
781
782 : push-attribute ( document -- )
783     [ current-attribute ]
784     [ tag>> attributes>> push-when ]
785     [ reset-attribute ] tri ;
786
787 : emit-eof ( document -- )
788     "emit-eof" print
789     f tree-insert drop ;
790 : emit-char ( char document -- ) drop "emit-char: " write 1string . ;
791 : emit-string ( char document -- ) drop "emit-string: " write . ;
792 : emit-tag ( document -- )
793     "emit-tag: " write
794     {
795         [ tag>> [ name>> >string ] [ name<< ] bi ]
796         [ push-attribute ]
797         [ tag>> . ]
798         [ dup tag>> tree-insert drop ]
799         [ f >>tag drop ]
800     } cleave ;
801 : emit-end-tag ( document -- )
802     "emit-end-tag: " write
803     [ tag>> . ]
804     [ f >>tag drop ] bi ;
805 : emit-comment-token ( document -- )
806     "emit-comment-token: " write
807     {
808         [ comment-token>> >string . ]
809         [ dup comment-token>> >string <comment> tree-insert drop ]
810         [ SBUF" " clone >>comment-token drop ]
811     } cleave ;
812 : emit-doctype ( document -- )
813     "emit-doctype: " write dup doctype>> .
814     {
815         [ doctype>> [ >string ] change-name drop ]
816         [
817             ! XXX: handle iframe srcdoc document
818             dup { [ doctype>> name>> "html" = not ] [ parser-cannot-change-mode-flag>> not ] } 1&& [
819                 t >>quirks-mode?
820             ] [
821                 dup { [ iframe-srcdoc?>> not ] [ parser-cannot-change-mode-flag>> not ] } 1&& [
822                     dup doctype>> limited-quirks-mode? [ t >>limited-quirks-mode? ] when
823                 ] when
824             ] if
825             drop
826         ]
827         [ dup doctype>> tree-insert drop ]
828         [ f >>doctype drop ]
829     } cleave ;
830
831 : reset-temporary-buffer ( document -- ) SBUF" " clone temporary-buffer<< ;
832 : ch>new-temporary-buffer ( ch document -- ) [ 1sbuf ] dip temporary-buffer<< ;
833 : string>new-temporary-buffer ( string document -- ) [ SBUF" " clone-like ] dip temporary-buffer<< ;
834 : temporary-buffer-last ( document -- ch/f ) temporary-buffer>> ?last ;
835 : push-temporary-buffer ( ch document -- ) temporary-buffer>> push ;
836 : push-all-temporary-buffer ( string document -- ) temporary-buffer>> push-all ;
837
838 : flush-temporary-buffer ( document -- )
839     "flush-temporary-buffer: " write
840     [ [ temporary-buffer>> ] keep [ emit-char ] curry each ]
841     [ SBUF" " clone >>temporary-buffer drop ] bi ;
842
843 : emit-temporary-buffer-with ( string document -- )
844     [ temporary-buffer>> push-all ]
845     [ flush-temporary-buffer ] bi ;
846
847 ! check if matches open tag
848 : appropriate-end-tag-token? ( document -- ? )
849     drop f ;
850
851 : ascii-upper-alpha? ( ch -- ? ) [ CHAR: A CHAR: Z between? ] [ f ] if* ; inline
852 : ascii-lower-alpha? ( ch -- ? ) [ CHAR: a CHAR: z between? ] [ f ] if* ; inline
853 : ascii-upper-hex-digit? ( ch -- ? ) [ CHAR: A CHAR: F between? ] [ f ] if* ; inline
854 : ascii-lower-hex-digit? ( ch -- ? ) [ CHAR: a CHAR: f between? ] [ f ] if* ; inline
855 : ascii-hex-alpha? ( ch -- ? ) { [ ascii-upper-hex-digit? ] [ ascii-lower-hex-digit? ] } 1|| ; inline
856
857 : ascii-digit? ( ch/f -- ? ) [ CHAR: 0 CHAR: 9 between? ] [ f ] if* ;
858 : ascii-alpha? ( ch/f -- ? ) { [ ascii-lower-alpha? ] [ ascii-upper-alpha? ] } 1|| ;
859 : ascii-alphanumeric? ( ch/f -- ? ) { [ ascii-alpha? ] [ ascii-digit? ] } 1|| ;
860 : ascii-hex-digit? ( ch/f -- ? ) { [ ascii-digit? ] [ ascii-hex-alpha? ] } 1|| ;
861
862 : (return-state) ( document n/f string ch/f -- document n'/f string )
863     reach [ f ] change-return-state drop check-return-state
864     execute( document n/f string ch/f -- document n'/f string ) ;
865
866 : return-state ( document n/f string -- document n'/f string )
867     pick [ f ] change-return-state drop check-return-state
868     execute( document n/f string -- document n'/f string ) ;
869
870 : (data-state) ( document n/f string ch/f -- document n'/f string )
871     {
872         { [ dup CHAR: & = ] [ drop [ \ data-state >>return-state ] 2dip character-reference-state ] }
873         { [ dup CHAR: < = ] [ drop tag-open-state ] }
874         { [ dup CHAR: \0 = ] [ unexpected-null-character ] }
875         { [ dup f = ] [ drop pick emit-eof ] }
876         [ reach emit-char data-state ]
877     } cond ;
878
879 : data-state ( document n/f string -- document n'/f string )
880     take-char (data-state) ;
881
882
883 : (rcdata-state) ( document n/f string ch/f -- document n'/f string )
884     {
885         { [ dup CHAR: & = ] [ drop [ \ rcdata-state >>return-state ] 2dip character-reference-state ] }
886         { [ dup CHAR: < = ] [ drop rcdata-less-than-sign-state ] }
887         { [ dup CHAR: \0 = ] [ unexpected-null-character ] }
888         { [ dup f = ] [ drop pick emit-eof ] }
889         [ reach emit-char rcdata-state ]
890     } cond ;
891
892 : rcdata-state ( document n/f string -- document n'/f string )
893     take-char (rcdata-state) ;
894
895
896 : (rawtext-state) ( document n/f string ch/f -- document n'/f string )
897     {
898         { [ dup CHAR: < = ] [ drop rawtext-less-than-sign-state ] }
899         { [ dup CHAR: \0 = ] [ drop unexpected-null-character ] }
900         { [ dup f = ] [ drop pick emit-eof ] }
901         [ reach emit-char rawtext-state ]
902     } cond ;
903
904 : rawtext-state ( document n/f string -- document n'/f string )
905     take-char (rawtext-state) ;
906
907
908 : (script-data-state) ( document n/f string ch/f -- document n'/f string )
909     {
910         { [ dup CHAR: < = ] [ drop script-data-less-than-sign-state ] }
911         { [ dup CHAR: \0 = ] [ drop unexpected-null-character ] }
912         { [ dup f = ] [ drop pick emit-eof ] }
913         [ reach emit-char script-data-state ]
914     } cond ;
915
916 : script-data-state ( document n/f string -- document n'/f string )
917     take-char (script-data-state) ;
918
919
920 : (plaintext-state) ( document n/f string ch/f -- document n'/f string )
921     {
922         { [ dup CHAR: \0 = ] [ drop unexpected-null-character ] }
923         { [ dup f = ] [ drop pick emit-eof ] }
924         [ reach emit-char plaintext-state ]
925     } cond ;
926
927 : plaintext-state ( document n/f string -- document n'/f string )
928     take-char (plaintext-state) ;
929
930
931 : (tag-open-state) ( document n/f string ch/f -- document n'/f string )
932     {
933         { [ dup ascii-alpha? ] [ reach new-tag (tag-name-state) ] }
934         { [ dup CHAR: ! = ] [ drop markup-declaration-open-state ] }
935         { [ dup CHAR: / = ] [ drop end-tag-open-state ] }
936         { [ dup CHAR: ? = ] [ unexpected-question-mark-instead-of-tag-name ] }
937         { [ dup f = ] [ eof-before-tag-name ] }
938         [ invalid-first-character-of-tag-name ]
939     } cond ;
940
941 : tag-open-state ( document n/f string -- document n'/f string )
942     take-char (tag-open-state) ;
943
944
945 : (end-tag-open-state) ( document n/f string ch/f -- document n'/f string )
946     {
947         { [ dup ascii-alpha? ] [ reach new-end-tag (tag-name-state) ] }
948         { [ dup CHAR: > = ] [ missing-end-tag-name ] }
949         { [ dup f = ] [ eof-before-tag-name ] }
950         [ invalid-first-character-of-tag-name ]
951     } cond ;
952
953 : end-tag-open-state ( document n/f string -- document n'/f string )
954     take-char (end-tag-open-state) ;
955
956
957 : (tag-name-state) ( document n/f string ch/f -- document n'/f string )
958     {
959         { [ dup ascii-upper-alpha? ] [ 0x20 + reach push-tag-name tag-name-state ] }
960         { [ dup "\t\n\f\s" member? ] [ drop before-attribute-name-state ] }
961         { [ dup CHAR: / = ] [ drop self-closing-start-tag-state ] }
962         { [ dup CHAR: > = ] [ drop pick emit-tag data-state ] }
963         { [ dup CHAR: \0 = ] [ unexpected-null-character ] }
964         { [ dup f = ] [ eof-before-tag-name ] }
965         [ reach push-tag-name tag-name-state ]
966     } cond ;
967
968 : tag-name-state ( document n/f string -- document n'/f string )
969     take-char (tag-name-state) ;
970
971
972 : (rcdata-less-than-sign-state) ( document n/f string ch/f -- document n'/f string )
973     {
974         { [ dup CHAR: / = ] [ drop pick reset-temporary-buffer rcdata-end-tag-open-state ] }
975         [ [ CHAR: < reach emit-char ] dip (rcdata-state) ]
976     } cond ;
977
978 : rcdata-less-than-sign-state ( document n/f string -- document n'/f string )
979     take-char (rcdata-less-than-sign-state) ;
980
981
982 : (rcdata-end-tag-open-state) ( document n/f string ch/f -- document n'/f string )
983     {
984         { [ dup ascii-alpha? ] [ reach new-end-tag (rcdata-end-tag-name-state) ] }
985         [ [ CHAR: < reach emit-char ] dip (rcdata-state) ]
986     } cond ;
987
988 : rcdata-end-tag-open-state ( document n/f string -- document n'/f string )
989     take-char (rcdata-end-tag-open-state) ;
990
991
992 : (rcdata-end-tag-name-state) ( document n/f string ch/f -- document n'/f string )
993     {
994         { [ dup "\t\n\f\s" member? ] [
995             drop pick appropriate-end-tag-token?
996             [ before-attribute-name-state ] [ "</" reach emit-temporary-buffer-with rcdata-state ] if
997         ] }
998         { [ dup CHAR: / = ] [
999             drop pick appropriate-end-tag-token?
1000             [ self-closing-start-tag-state ] [ "</" reach emit-temporary-buffer-with rcdata-state ] if
1001         ] }
1002         { [ dup CHAR: > = ] [
1003             drop pick appropriate-end-tag-token?
1004             [ pick emit-end-tag data-state ] [ "</" reach emit-temporary-buffer-with rcdata-state ] if
1005         ] }
1006         { [ dup ascii-upper-alpha? ] [ [ 0x20 + reach push-tag-name ] [ reach push-temporary-buffer ] bi rcdata-end-tag-name-state ] }
1007         { [ dup ascii-lower-alpha? ] [ [ reach push-tag-name ] [ reach push-temporary-buffer ] bi rcdata-end-tag-name-state ] }
1008         [ [ "</" reach emit-temporary-buffer-with ] dip (rcdata-state) ]
1009     } cond ;
1010
1011 : rcdata-end-tag-name-state ( document n/f string -- document n'/f string )
1012     take-char (rcdata-end-tag-name-state) ;
1013
1014
1015 : (rawtext-less-than-sign-state) ( document n/f string ch/f -- document n'/f string )
1016     {
1017         { [ dup CHAR: / = ] [ drop pick reset-temporary-buffer rawtext-end-tag-open-state ] }
1018         [ [ CHAR: < reach emit-char ] dip (rawtext-state) ]
1019     } cond ;
1020
1021 : rawtext-less-than-sign-state ( document n/f string -- document n'/f string )
1022     take-char (rawtext-less-than-sign-state) ;
1023
1024
1025 : (rawtext-end-tag-open-state) ( document n/f string ch/f -- document n'/f string )
1026     {
1027         { [ dup ascii-alpha? ] [ reach new-end-tag (rawtext-end-tag-name-state) ] }
1028         [ [ CHAR: < reach emit-char ] dip (rawtext-state) ]
1029     } cond ;
1030
1031 : rawtext-end-tag-open-state ( document n/f string -- document n'/f string )
1032     take-char (rawtext-end-tag-open-state) ;
1033
1034
1035 : (rawtext-end-tag-name-state) ( document n/f string ch/f -- document n'/f string )
1036     {
1037         { [ dup "\t\n\f\s" member? ] [
1038             drop pick appropriate-end-tag-token?
1039             [ before-attribute-name-state ] [ "</" reach emit-temporary-buffer-with rawtext-state ] if
1040         ] }
1041         { [ dup CHAR: / = ] [
1042             drop pick appropriate-end-tag-token?
1043             [ self-closing-start-tag-state ] [ "</" reach emit-temporary-buffer-with rawtext-state ] if
1044         ] }
1045         { [ dup CHAR: > = ] [
1046             drop pick appropriate-end-tag-token?
1047             [ pick emit-end-tag data-state ] [ "</" reach emit-temporary-buffer-with rawtext-state ] if
1048         ] }
1049         { [ dup ascii-upper-alpha? ] [ [ 0x20 + reach push-tag-name ] [ reach push-temporary-buffer ] bi rawtext-end-tag-name-state ] }
1050         { [ dup ascii-lower-alpha? ] [ [ reach push-tag-name ] [ reach push-temporary-buffer ] bi rawtext-end-tag-name-state ] }
1051         [ [ "</" reach emit-temporary-buffer-with ] dip (rawtext-state) ]
1052     } cond ;
1053
1054 : rawtext-end-tag-name-state ( document n/f string -- document n'/f string )
1055     take-char (rawtext-end-tag-name-state) ;
1056
1057
1058 : (script-data-less-than-sign-state) ( document n/f string ch/f -- document n'/f string )
1059     {
1060         { [ dup CHAR: / = ] [ drop pick reset-temporary-buffer script-data-end-tag-open-state ] }
1061         { [ dup CHAR: ! = ] [ drop "<!" reach emit-string script-data-escape-start-state ] }
1062         [ [ CHAR: < reach emit-char ] dip (script-data-state) ]
1063     } cond ;
1064
1065 : script-data-less-than-sign-state ( document n/f string -- document n'/f string )
1066     take-char (script-data-less-than-sign-state) ;
1067
1068
1069 : (script-data-end-tag-open-state) ( document n/f string ch/f -- document n'/f string )
1070     {
1071         { [ dup ascii-alpha? ] [ reach new-end-tag (script-data-end-tag-name-state) ] }
1072         [ [ "</" reach emit-string ] dip (script-data-state) ]
1073     } cond ;
1074
1075 : script-data-end-tag-open-state ( document n/f string -- document n'/f string )
1076     take-char (script-data-end-tag-open-state) ;
1077
1078
1079 : (script-data-end-tag-name-state) ( document n/f string ch/f -- document n'/f string )
1080     {
1081         { [ dup "\t\n\f\s" member? ] [
1082             drop pick appropriate-end-tag-token?
1083             [ before-attribute-name-state ] [ "</" reach emit-temporary-buffer-with script-data-state ] if
1084         ] }
1085         { [ dup CHAR: / = ] [
1086             drop pick appropriate-end-tag-token?
1087             [ self-closing-start-tag-state ] [ "</" reach emit-temporary-buffer-with script-data-state ] if
1088         ] }
1089         { [ dup CHAR: > = ] [
1090             drop pick appropriate-end-tag-token?
1091             [ pick emit-end-tag data-state ] [ "</" reach emit-temporary-buffer-with script-data-state ] if
1092         ] }
1093         { [ dup ascii-upper-alpha? ] [ [ 0x20 + reach push-tag-name ] [ reach push-temporary-buffer ] bi rawtext-end-tag-name-state ] }
1094         { [ dup ascii-lower-alpha? ] [ [ reach push-tag-name ] [ reach push-temporary-buffer ] bi rawtext-end-tag-name-state ] }
1095         [ [ "</" reach emit-temporary-buffer-with ] dip (script-data-state) ]
1096     } cond ;
1097
1098 : script-data-end-tag-name-state ( document n/f string -- document n'/f string )
1099     take-char (script-data-end-tag-name-state) ;
1100
1101
1102 : (script-data-escape-start-state) ( document n/f string ch/f -- document n'/f string )
1103     {
1104         { [ dup CHAR: - = ] [ drop script-data-escape-start-dash-state ] }
1105         [ (script-data-state) ]
1106     } cond ;
1107
1108 : script-data-escape-start-state ( document n/f string -- document n'/f string )
1109     take-char (script-data-escape-start-state) ;
1110
1111
1112 : (script-data-escape-start-dash-state) ( document n/f string ch/f -- document n'/f string )
1113     {
1114         { [ dup CHAR: - = ] [ drop script-data-escaped-dash-dash-state ] }
1115         [ (script-data-state) ]
1116     } cond ;
1117
1118 : script-data-escape-start-dash-state ( document n/f string -- document n'/f string )
1119     take-char (script-data-escape-start-dash-state) ;
1120
1121
1122 : (script-data-escaped-state) ( document n/f string ch/f -- document n'/f string )
1123     {
1124         { [ dup CHAR: - = ] [ drop script-data-escaped-dash-state ] }
1125         { [ dup CHAR: < = ] [ drop script-data-escaped-less-than-sign-state ] }
1126         { [ dup CHAR: \0 = ] [ unexpected-null-character CHAR: replacement-character unimplemented* ] }
1127         { [ dup f = ] [ eof-in-script-html-comment-like-text ] }
1128         [ reach emit-char script-data-escaped-state ]
1129     } cond ;
1130
1131 : script-data-escaped-state ( document n/f string -- document n'/f string )
1132     take-char (script-data-escaped-state) ;
1133
1134
1135 : (script-data-escaped-dash-state) ( document n/f string ch/f -- document n'/f string )
1136     {
1137         { [ dup CHAR: - = ] [ drop script-data-escaped-dash-dash-state ] }
1138         { [ dup CHAR: < = ] [ drop script-data-escaped-less-than-sign-state ] }
1139         { [ dup CHAR: \0 = ] [ unexpected-null-character script-data-escaped-state ] }
1140         { [ dup f = ] [ eof-in-script-html-comment-like-text ] }
1141         [ reach emit-char script-data-escaped-state ]
1142     } cond ;
1143
1144 : script-data-escaped-dash-state ( document n/f string -- document n'/f string )
1145     take-char (script-data-escaped-dash-state) ;
1146
1147
1148 : (script-data-escaped-dash-dash-state) ( document n/f string ch/f -- document n'/f string )
1149     {
1150         { [ dup CHAR: - = ] [ reach emit-char script-data-escaped-dash-dash-state ] }
1151         { [ dup CHAR: < = ] [ drop script-data-escaped-less-than-sign-state ] }
1152         { [ dup CHAR: > = ] [ reach emit-char script-data-state ] }
1153         { [ dup CHAR: \0 = ] [ unexpected-null-character script-data-escaped-state ] }
1154         { [ dup f = ] [ eof-in-script-html-comment-like-text ] }
1155         [ reach emit-char script-data-escaped-state ]
1156     } cond ;
1157
1158 : script-data-escaped-dash-dash-state ( document n/f string -- document n'/f string )
1159     take-char (script-data-escaped-dash-dash-state) ;
1160
1161
1162 : (script-data-escaped-less-than-sign-state) ( document n/f string ch/f -- document n'/f string )
1163     {
1164         { [ dup CHAR: / = ] [ drop pick reset-temporary-buffer script-data-escaped-end-tag-open-state ] }
1165         { [ dup ascii-alpha? ] [ [ pick reset-temporary-buffer CHAR: < reach emit-char ] dip (script-data-double-escape-start-state) ] }
1166         [ [ CHAR: < reach emit-char ] dip (script-data-escaped-state) ]
1167     } cond ;
1168
1169 : script-data-escaped-less-than-sign-state ( document n/f string -- document n'/f string )
1170     take-char (script-data-escaped-less-than-sign-state) ;
1171
1172
1173 : (script-data-escaped-end-tag-open-state) ( document n/f string ch/f -- document n'/f string )
1174     {
1175         { [ dup ascii-alpha? ] [ [ pick new-end-tag ] dip (script-data-escaped-end-tag-name-state) ] }
1176         [ [ "</" reach emit-string ] dip (script-data-escaped-state) ]
1177     } cond ;
1178
1179 : script-data-escaped-end-tag-open-state ( document n/f string -- document n'/f string )
1180     take-char (script-data-escaped-end-tag-open-state) ;
1181
1182
1183 : (script-data-escaped-end-tag-name-state) ( document n/f string ch/f -- document n'/f string )
1184     {
1185         { [ dup "\t\n\f\s" member? ] [
1186             drop pick appropriate-end-tag-token?
1187             [ before-attribute-name-state ] [ "</" reach emit-temporary-buffer-with script-data-escaped-state ] if
1188         ] }
1189         { [ dup CHAR: / = ] [
1190             drop pick appropriate-end-tag-token?
1191             [ self-closing-start-tag-state ] [ "</" reach emit-temporary-buffer-with script-data-escaped-state ] if
1192         ] }
1193         { [ dup CHAR: > = ] [
1194             drop pick appropriate-end-tag-token?
1195             [ pick emit-end-tag data-state ] [ "</" reach emit-temporary-buffer-with script-data-escaped-state ] if
1196         ] }
1197         { [ dup ascii-upper-alpha? ] [ [ 0x20 + reach push-tag-name ] [ reach push-temporary-buffer ] bi script-data-escaped-end-tag-name-state ] }
1198         { [ dup ascii-lower-alpha? ] [ [ reach push-tag-name ] [ reach push-temporary-buffer ] bi script-data-escaped-end-tag-name-state ] }
1199         [ [ "</" reach emit-temporary-buffer-with ] dip (script-data-escaped-state) ]
1200     } cond ;
1201
1202 : script-data-escaped-end-tag-name-state ( document n/f string -- document n'/f string )
1203     take-char (script-data-escaped-end-tag-name-state) ;
1204
1205
1206 : (script-data-double-escape-start-state) ( document n/f string ch/f -- document n'/f string )
1207     {
1208         { [ dup "\t\n\f\s/>" member? ] [
1209             reach emit-char
1210             pick temporary-buffer>> "script" sequence=
1211             [ script-data-double-escaped-state ] [ script-data-escaped-state ] if
1212         ] }
1213         { [ dup ascii-upper-alpha? ] [ [ 0x20 + reach push-tag-name ] [ reach push-temporary-buffer ] bi script-data-double-escape-start-state ] }
1214         { [ dup ascii-lower-alpha? ] [ [ reach push-tag-name ] [ reach push-temporary-buffer ] bi script-data-double-escape-start-state ] } ! todo
1215         [ (script-data-escaped-state) ]
1216     } cond ;
1217
1218 : script-data-double-escape-start-state ( document n/f string -- document n'/f string )
1219     take-char (script-data-double-escape-start-state) ;
1220
1221
1222 : (script-data-double-escaped-state) ( document n/f string ch/f -- document n'/f string )
1223     {
1224         { [ dup CHAR: - = ] [ reach emit-char script-data-double-escaped-dash-state ] }
1225         { [ dup CHAR: < = ] [ reach emit-char script-data-double-escaped-less-than-sign-state ] }
1226         { [ dup CHAR: \0 = ] [
1227             unexpected-null-character
1228             CHAR: replacement-character reach emit-char
1229             script-data-double-escaped-state
1230         ] }
1231         { [ dup f = ] [ eof-in-script-html-comment-like-text ] }
1232         [ reach emit-char script-data-double-escaped-state ]
1233     } cond ;
1234
1235 : script-data-double-escaped-state ( document n/f string -- document n'/f string )
1236     take-char (script-data-double-escaped-state) ;
1237
1238
1239 : (script-data-double-escaped-dash-state) ( document n/f string ch/f -- document n'/f string )
1240     {
1241         { [ dup CHAR: - = ] [ reach emit-char script-data-double-escaped-dash-dash-state ] }
1242         { [ dup CHAR: < = ] [ reach emit-char script-data-double-escaped-less-than-sign-state ] }
1243         { [ dup CHAR: \0 = ] [
1244             unexpected-null-character
1245             CHAR: replacement-character reach emit-char
1246             script-data-double-escaped-state
1247         ] }
1248         { [ dup f = ] [ eof-in-script-html-comment-like-text ] }
1249         [ reach emit-char script-data-double-escaped-state ]
1250     } cond ;
1251
1252 : script-data-double-escaped-dash-state ( document n/f string -- document n'/f string )
1253     take-char (script-data-double-escaped-dash-state) ;
1254
1255
1256 : (script-data-double-escaped-dash-dash-state) ( document n/f string ch/f -- document n'/f string )
1257     {
1258         { [ dup CHAR: - = ] [ reach emit-char script-data-double-escaped-dash-dash-state ] }
1259         { [ dup CHAR: < = ] [ reach emit-char script-data-double-escaped-less-than-sign-state ] }
1260         { [ dup CHAR: > = ] [ reach emit-char script-data-state ] }
1261         { [ dup CHAR: \0 = ] [
1262             unexpected-null-character
1263             CHAR: replacement-character reach emit-char
1264             script-data-double-escaped-state
1265         ] }
1266         { [ dup f = ] [ eof-in-script-html-comment-like-text ] }
1267         [ reach emit-char script-data-escaped-state ]
1268     } cond ;
1269
1270 : script-data-double-escaped-dash-dash-state ( document n/f string -- document n'/f string )
1271     take-char (script-data-double-escaped-dash-dash-state) ;
1272
1273
1274 : (script-data-double-escaped-less-than-sign-state) ( document n/f string ch/f -- document n'/f string )
1275     {
1276         { [ dup CHAR: / = ] [ reach emit-char pick reset-temporary-buffer script-data-double-escape-end-state ] }
1277         [ (script-data-double-escaped-state) ]
1278     } cond ;
1279
1280 : script-data-double-escaped-less-than-sign-state ( document n/f string -- document n'/f string )
1281     take-char (script-data-double-escaped-less-than-sign-state) ;
1282
1283
1284 : (script-data-double-escape-end-state) ( document n/f string ch/f -- document n'/f string )
1285     {
1286         { [ dup "\t\n\f\s/>" member? ] [
1287             reach emit-char
1288             pick temporary-buffer>> "script" sequence=
1289             [ script-data-escaped-state ] [ script-data-double-escaped-state ] if
1290         ] }
1291         { [ dup ascii-upper-alpha? ] [ [ 0x20 + reach push-tag-name ] [ reach push-temporary-buffer ] bi script-data-double-escape-end-state ] }
1292         { [ dup ascii-lower-alpha? ] [ [ reach push-tag-name ] [ reach push-temporary-buffer ] bi script-data-double-escape-end-state ] } ! todo
1293         [ (script-data-double-escaped-state) ]
1294     } cond ;
1295
1296 : script-data-double-escape-end-state ( document n/f string -- document n'/f string )
1297     take-char (script-data-double-escape-end-state) ;
1298
1299
1300 : (before-attribute-name-state) ( document n/f string ch/f -- document n'/f string )
1301     {
1302         { [ dup "\t\n\f\s" member? ] [ drop before-attribute-name-state ] }
1303         { [ dup "/>" member? ] [ (after-attribute-name-state) ] }
1304         { [ dup f = ] [ (after-attribute-name-state) ] }
1305         { [ dup CHAR: = = ] [ unexpected-equals-sign-before-attribute-name ] }
1306         [ reach push-attribute (attribute-name-state) ]
1307     } cond ;
1308
1309 : before-attribute-name-state ( document n/f string -- document n'/f string )
1310     take-char (before-attribute-name-state) ;
1311
1312
1313 : (attribute-name-state) ( document n/f string ch/f -- document n'/f string )
1314     {
1315         { [ dup "\t\n\f\s/>" member? ] [ (after-attribute-name-state) ] }
1316         { [ dup f = ] [ (after-attribute-name-state) ] }
1317         { [ dup CHAR: = = ] [ drop before-attribute-value-state ] }
1318         { [ dup ascii-upper-alpha? ] [
1319             0x20 + reach push-attribute-name
1320             attribute-name-state
1321         ] }
1322         { [ dup CHAR: \0 = ] [ unexpected-null-character ] }
1323         { [ dup "\"'<" member? ] [
1324             unexpected-character-in-attribute-name
1325             reach push-attribute-name attribute-name-state
1326         ] }
1327         [ reach push-attribute-name attribute-name-state ]
1328     } cond ;
1329
1330 : attribute-name-state ( document n/f string -- document n'/f string )
1331     take-char (attribute-name-state) ;
1332
1333
1334 : (after-attribute-name-state) ( document n/f string ch/f -- document n'/f string )
1335     {
1336         { [ dup "\t\n\f\s" member? ] [ drop after-attribute-name-state ] }
1337         { [ dup CHAR: / = ] [ drop self-closing-start-tag-state ] }
1338         { [ dup CHAR: = = ] [ drop before-attribute-value-state ] }
1339         { [ dup CHAR: > = ] [ drop pick emit-tag data-state ] }
1340         { [ dup f = ] [ eof-in-tag ] }
1341         [ [ pick push-attribute ] dip (attribute-name-state) ]
1342     } cond ;
1343
1344 : after-attribute-name-state ( document n/f string -- document n'/f string )
1345     take-char (after-attribute-name-state) ;
1346
1347
1348 : (before-attribute-value-state) ( document n/f string ch/f -- document n'/f string )
1349     {
1350         { [ dup "\t\n\f\s" member? ] [ drop before-attribute-name-state ] }
1351         { [ dup CHAR: " = ] [ drop attribute-value-double-quoted-state ] }
1352         { [ dup CHAR: ' = ] [ drop attribute-value-single-quoted-state ] }
1353         { [ dup CHAR: > = ] [ drop missing-attribute-value ] }
1354         [ (attribute-value-unquoted-state) ]
1355     } cond ;
1356
1357 : before-attribute-value-state ( document n/f string -- document n'/f string )
1358     take-char (before-attribute-value-state) ;
1359
1360
1361 : (attribute-value-double-quoted-state) ( document n/f string ch/f -- document n'/f string )
1362     {
1363         { [ dup CHAR: " = ] [ drop after-attribute-value-quoted-state ] }
1364         { [ dup CHAR: & = ] [
1365             drop
1366             [ \ attribute-value-double-quoted-state >>return-state ] 2dip character-reference-state
1367         ] }
1368         { [ dup CHAR: \0 = ] [ unexpected-null-character ] }
1369         { [ dup f = ] [ eof-in-tag ] }
1370         [ reach push-attribute-value attribute-value-double-quoted-state ]
1371     } cond ;
1372
1373 : attribute-value-double-quoted-state ( document n/f string -- document n'/f string )
1374     take-char (attribute-value-double-quoted-state) ;
1375
1376
1377 : (attribute-value-single-quoted-state) ( document n/f string ch/f -- document n'/f string )
1378     {
1379         { [ dup CHAR: ' = ] [ drop after-attribute-value-quoted-state ] }
1380         { [ dup CHAR: & = ] [
1381             drop [ \ attribute-value-single-quoted-state >>return-state ] 2dip
1382             character-reference-state
1383         ] }
1384         { [ dup CHAR: \0 = ] [
1385             drop unexpected-null-character
1386             CHAR: replacement-character reach push-attribute-value
1387         ] }
1388         { [ dup f = ] [ eof-in-tag ] }
1389         [ reach push-attribute-value attribute-value-single-quoted-state ]
1390     } cond ;
1391
1392 : attribute-value-single-quoted-state ( document n/f string -- document n'/f string )
1393     take-char (attribute-value-single-quoted-state) ;
1394
1395
1396 : (attribute-value-unquoted-state) ( document n/f string ch/f -- document n'/f string )
1397     {
1398         { [ dup "\t\n\f\s" member? ] [ drop before-attribute-name-state ] }
1399         { [ dup CHAR: & = ] [
1400             drop
1401             [ \ attribute-value-unquoted-state >>return-state ] 2dip character-reference-state
1402         ] }
1403         { [ dup CHAR: > = ] [ drop pick emit-tag data-state ] }
1404         { [ dup CHAR: \0 = ] [ drop unexpected-null-character CHAR: replacement-character reach push-attribute-value ] }
1405         { [ dup "\"'<=`" member? ] [
1406             unexpected-character-in-unquoted-attribute-value
1407             reach push-attribute-value
1408             attribute-value-unquoted-state
1409         ] }
1410         { [ dup f = ] [ eof-in-tag ] }
1411         [ reach push-attribute-value attribute-value-unquoted-state ]
1412     } cond ;
1413
1414 : attribute-value-unquoted-state ( document n/f string -- document n'/f string )
1415     take-char (attribute-value-unquoted-state) ;
1416
1417
1418 : (after-attribute-value-quoted-state) ( document n/f string ch/f -- document n'/f string )
1419     {
1420         { [ dup "\t\n\f\s" member? ] [ drop before-attribute-name-state ] }
1421         { [ dup CHAR: / = ] [ drop self-closing-start-tag-state ] }
1422         { [ dup CHAR: > = ] [ drop pick emit-tag data-state ] }
1423         { [ dup f = ] [ eof-in-tag ] }
1424         [ missing-whitespace-between-attributes (before-attribute-name-state) ]
1425     } cond ;
1426
1427 : after-attribute-value-quoted-state ( document n/f string -- document n'/f string )
1428     take-char (after-attribute-value-quoted-state) ;
1429
1430
1431 : (self-closing-start-tag-state) ( document n/f string ch/f -- document n'/f string )
1432     {
1433         { [ dup CHAR: > = ] [ drop pick [ set-self-closing ] [ emit-tag ] bi data-state ] }
1434         { [ dup f = ] [ eof-in-tag ] }
1435         [ unexpected-solidus-in-tag ]
1436     } cond ;
1437
1438 : self-closing-start-tag-state ( document n/f string -- document n'/f string )
1439     take-char (self-closing-start-tag-state) ;
1440
1441
1442 : (bogus-comment-state) ( document n/f string ch/f -- document n'/f string )
1443     {
1444         { [ dup CHAR: > = ] [ drop pick emit-comment-token data-state ] }
1445         { [ dup f = ] [ drop pick [ emit-comment-token ] [ emit-eof ] bi ] }
1446         { [ dup CHAR: \0 = ] [ drop unexpected-null-character CHAR: replacement-character reach push-comment-token ] }
1447         [ reach push-comment-token bogus-comment-state ]
1448     } cond ;
1449
1450 : bogus-comment-state ( document n/f string -- document n'/f string )
1451     take-char (bogus-comment-state) ;
1452
1453
1454 : markup-declaration-open-state ( document n/f string -- document n'/f string )
1455     {
1456         { [ "--" take-from? ] [ comment-start-state ] }
1457         { [ "DOCTYPE" take-from-insensitive? ] [ doctype-state ] }
1458         { [ "[CDATA[" take-from-insensitive? ] [ unimplemented* ] }
1459         [
1460             incorrectly-opened-comment ! bogus-comment-state
1461         ]
1462     } cond ;
1463
1464 : (comment-start-state) ( document n/f string ch/f -- document n'/f string )
1465     {
1466         { [ dup CHAR: - = ] [ drop comment-start-dash-state ] }
1467         { [ dup CHAR: > = ] [ drop abrupt-closing-of-empty-comment pick emit-comment-token data-state ] }
1468         [ (comment-state) ]
1469     } cond ;
1470
1471 : comment-start-state ( document n/f string -- document n'/f string )
1472     take-char (comment-start-state) ;
1473
1474
1475 : (comment-start-dash-state) ( document n/f string ch/f -- document n'/f string )
1476     {
1477         { [ dup CHAR: - = ] [ drop comment-end-state ] }
1478         { [ dup CHAR: > = ] [ drop abrupt-closing-of-empty-comment ] }
1479         { [ dup f = ] [ eof-in-comment ] }
1480         [ [ CHAR: - reach push-comment-token ] dip (comment-state) ]
1481     } cond ;
1482
1483 : comment-start-dash-state ( document n/f string -- document n'/f string )
1484     take-char (comment-start-dash-state) ;
1485
1486
1487 : (comment-state) ( document n/f string ch/f -- document n'/f string )
1488     {
1489         { [ dup CHAR: < = ] [ reach push-comment-token comment-less-than-sign-state ] }
1490         { [ dup CHAR: - = ] [ drop comment-end-dash-state ] }
1491         { [ dup CHAR: \0 = ] [ drop unexpected-null-character ] }
1492         { [ dup f = ] [ eof-in-comment ] }
1493         [ reach push-comment-token comment-state ]
1494     } cond ;
1495
1496 : comment-state ( document n/f string -- document n'/f string )
1497     take-char (comment-state) ;
1498
1499
1500 : (comment-less-than-sign-state) ( document n/f string ch/f -- document n'/f string )
1501     {
1502         { [ dup CHAR: ! = ] [ reach push-comment-token comment-less-than-sign-bang-state ] }
1503         { [ dup CHAR: < = ] [ reach push-comment-token comment-less-than-sign-state ] }
1504         [ (comment-state) ]
1505     } cond ;
1506
1507 : comment-less-than-sign-state ( document n/f string -- document n'/f string )
1508     take-char (comment-less-than-sign-state) ;
1509
1510
1511 : (comment-less-than-sign-bang-state) ( document n/f string ch/f -- document n'/f string )
1512     {
1513         { [ dup CHAR: - = ] [ reach push-comment-token comment-less-than-sign-bang-dash-state ] }
1514         [ (comment-state) ]
1515     } cond ;
1516
1517 : comment-less-than-sign-bang-state ( document n/f string -- document n'/f string )
1518     take-char (comment-less-than-sign-bang-state) ;
1519
1520
1521 : (comment-less-than-sign-bang-dash-state) ( document n/f string ch/f -- document n'/f string )
1522     {
1523         { [ dup CHAR: - = ] [ drop comment-less-than-sign-bang-dash-dash-state ] }
1524         [ (comment-end-dash-state) ]
1525     } cond ;
1526
1527 : comment-less-than-sign-bang-dash-state ( document n/f string -- document n'/f string )
1528     take-char (comment-less-than-sign-bang-dash-state) ;
1529
1530
1531 : (comment-less-than-sign-bang-dash-dash-state) ( document n/f string ch/f -- document n'/f string )
1532     {
1533         { [ dup CHAR: > = ] [ (comment-end-state) ] }
1534         { [ dup f = ] [ (comment-end-state) ] }
1535         [ nested-comment (comment-end-state) ]
1536     } cond ;
1537
1538 : comment-less-than-sign-bang-dash-dash-state ( document n/f string -- document n'/f string )
1539     take-char (comment-less-than-sign-bang-dash-dash-state) ;
1540
1541
1542 : (comment-end-dash-state) ( document n/f string ch/f -- document n'/f string )
1543     {
1544         { [ dup CHAR: - = ] [ drop comment-end-state ] }
1545         { [ dup f = ] [ eof-in-comment ] }
1546         [ [ CHAR: - reach push-comment-token ] dip (comment-state) ]
1547     } cond ;
1548
1549 : comment-end-dash-state ( document n/f string -- document n'/f string )
1550     take-char (comment-end-dash-state) ;
1551
1552
1553 : (comment-end-state) ( document n/f string ch/f -- document n'/f string )
1554     {
1555         { [ dup CHAR: > = ] [ drop pick emit-comment-token data-state ] }
1556         { [ dup CHAR: ! = ] [ drop comment-end-bang-state ] }
1557         { [ dup CHAR: - = ] [ reach push-comment-token comment-end-state ] }
1558         { [ dup f = ] [ drop eof-in-comment pick [ emit-comment-token ] [ emit-eof ] bi ] }
1559         [ [ "--" reach push-all-comment-token ] dip (comment-state) ]
1560     } cond ;
1561
1562 : comment-end-state ( document n/f string -- document n'/f string )
1563     take-char (comment-end-state) ;
1564
1565
1566 : (comment-end-bang-state) ( document n/f string ch/f -- document n'/f string )
1567     {
1568         { [ dup CHAR: - = ] [ drop comment-end-dash-state ] }
1569         { [ dup CHAR: > = ] [ drop incorrectly-closed-comment pick emit-comment-token data-state ] }
1570         { [ dup f = ] [ eof-in-comment ] }
1571         [ [ "--!" reach push-all-comment-token ] dip (comment-state) ]
1572     } cond ;
1573
1574 : comment-end-bang-state ( document n/f string -- document n'/f string )
1575     take-char (comment-end-bang-state) ;
1576
1577
1578 : (doctype-state) ( document n/f string ch/f -- document n'/f string )
1579     {
1580         { [ dup "\t\n\f\s" member? ] [ drop before-doctype-name-state ] }
1581         { [ dup CHAR: > = ] [ (before-doctype-name-state) ] }
1582         { [ dup f = ] [ drop eof-in-doctype pick [ new-doctype-with-quirks ] [ emit-doctype ] [ emit-eof ] tri ] }
1583         [ missing-whitespace-before-doctype-name ]
1584     } cond ;
1585
1586 : doctype-state ( document n/f string -- document n'/f string )
1587     take-char (doctype-state) ;
1588
1589
1590 : (before-doctype-name-state) ( document n/f string ch/f -- document n'/f string )
1591     {
1592         { [ dup "\t\n\f\s" member? ] [ drop before-doctype-name-state ] }
1593         { [ dup ascii-upper-alpha? ] [ 0x20 + reach new-doctype-from-ch doctype-name-state ] }
1594         { [ dup CHAR: \0 = ] [
1595             drop
1596             unexpected-null-character
1597             CHAR: replacement-character reach new-doctype-from-ch
1598             doctype-name-state
1599         ] }
1600         { [ dup CHAR: > = ] [
1601             drop missing-doctype-name
1602             pick [ new-doctype-with-quirks ] [ emit-doctype ] bi
1603         ] }
1604         { [ dup f = ] [
1605             drop eof-in-doctype
1606             pick [ new-doctype-with-quirks ] [ emit-doctype ] [ emit-eof ] tri
1607         ] }
1608         [ reach new-doctype-from-ch doctype-name-state ]
1609     } cond ;
1610
1611 : before-doctype-name-state ( document n/f string -- document n'/f string )
1612     take-char (before-doctype-name-state) ;
1613
1614
1615 : (doctype-name-state) ( document n/f string ch/f -- document n'/f string )
1616     {
1617         { [ dup "\t\n\f\s" member? ] [ drop after-doctype-name-state ] }
1618         { [ dup CHAR: > = ] [ drop pick emit-doctype data-state ] }
1619         { [ dup ascii-upper-alpha? ] [ 0x20 + reach push-doctype-name doctype-name-state ] }
1620         { [ dup CHAR: \0 = ] [
1621             drop unexpected-null-character
1622             CHAR: replacement-character pick push-doctype-name
1623             doctype-name-state
1624         ] }
1625         { [ dup f = ] [ drop eof-in-doctype pick [ emit-doctype ] [ emit-eof ] bi ] } ! force-quirks on for doctype
1626         [ reach push-doctype-name doctype-name-state ]
1627     } cond ;
1628
1629 : doctype-name-state ( document n/f string -- document n'/f string )
1630     take-char (doctype-name-state) ;
1631
1632
1633 : (after-doctype-name-state) ( document n/f string ch/f -- document n'/f string )
1634     {
1635         { [ dup "\t\n\f\s" member? ] [ drop after-doctype-name-state ] }
1636         { [ dup CHAR: > = ] [ drop pick emit-doctype data-state ] }
1637         { [ dup f = ] [ eof-in-doctype ] }
1638         { [ [ "PUBLIC" take-from-insensitive? ] dip swap ] [ drop after-doctype-public-keyword-state ] }
1639         { [ [ "SYSTEM" take-from-insensitive? ] dip swap ] [ drop after-doctype-system-keyword-state ] }
1640         [ invalid-character-sequence-after-doctype-name ]
1641     } cond ;
1642
1643 : after-doctype-name-state ( document n/f string -- document n'/f string )
1644     take-char (after-doctype-name-state) ;
1645
1646
1647 : (after-doctype-public-keyword-state) ( document n/f string ch/f -- document n'/f string )
1648     {
1649         { [ dup "\t\n\f\s" member? ] [ drop before-doctype-public-identifier-state ] }
1650         { [ dup CHAR: " = ] [ missing-whitespace-after-doctype-public-keyword ] }
1651         { [ dup CHAR: ' = ] [ missing-whitespace-after-doctype-public-keyword ] }
1652         { [ dup CHAR: > = ] [ drop missing-doctype-public-identifier force-quirks data-state ] }
1653         { [ dup f = ] [ drop eof-in-doctype pick [ emit-doctype ] [ emit-eof ] bi ] }
1654         [
1655             missing-quote-before-doctype-public-identifier
1656             [ reach force-quirks ] dip
1657             (bogus-doctype-state)
1658         ]
1659     } cond ;
1660
1661 : after-doctype-public-keyword-state ( document n/f string -- document n'/f string )
1662     take-char (after-doctype-public-keyword-state) ;
1663
1664
1665 : (before-doctype-public-identifier-state) ( document n/f string ch/f -- document n'/f string )
1666     {
1667         { [ dup "\t\n\f\s" member? ] [ drop before-doctype-public-identifier-state ] }
1668         { [ dup CHAR: " = ] [
1669             drop pick initialize-doctype-public-identifier
1670             doctype-public-identifier-double-quoted-state
1671         ] }
1672         { [ dup CHAR: ' = ] [
1673             drop pick initialize-doctype-public-identifier
1674             doctype-public-identifier-single-quoted-state
1675         ] }
1676         { [ dup CHAR: > = ] [
1677             drop missing-doctype-public-identifier
1678             pick [ force-quirks ] [ emit-doctype ] bi
1679             data-state
1680         ] }
1681         { [ dup f = ] [ drop eof-in-doctype pick [ emit-doctype ] [ emit-eof ] bi ] }
1682         [
1683             missing-quote-before-doctype-public-identifier
1684             [ reach force-quirks ] dip
1685             (bogus-doctype-state)
1686         ]
1687     } cond ;
1688
1689 : before-doctype-public-identifier-state ( document n/f string -- document n'/f string )
1690     take-char (before-doctype-public-identifier-state) ;
1691
1692
1693 : (doctype-public-identifier-double-quoted-state) ( document n/f string ch/f -- document n'/f string )
1694     {
1695         { [ dup CHAR: " = ] [ drop after-doctype-public-identifier-state ] }
1696         { [ dup CHAR: \0 = ] [
1697             drop
1698             unexpected-null-character
1699             CHAR: replacement-character pick push-doctype-public-identifier
1700             doctype-public-identifier-double-quoted-state
1701         ] }
1702         { [ dup CHAR: > = ] [
1703             drop abrupt-doctype-public-identifier
1704             pick [ force-quirks ] [ emit-doctype ] bi
1705             data-state
1706         ] }
1707         { [ dup f = ] [
1708             drop eof-in-doctype
1709             pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri
1710         ] }
1711         [ reach push-doctype-public-identifier doctype-public-identifier-double-quoted-state ]
1712     } cond ;
1713
1714 : doctype-public-identifier-double-quoted-state ( document n/f string -- document n'/f string )
1715     take-char (doctype-public-identifier-double-quoted-state) ;
1716
1717
1718 : (doctype-public-identifier-single-quoted-state) ( document n/f string ch/f -- document n'/f string )
1719     {
1720         { [ dup CHAR: ' = ] [ drop after-doctype-public-identifier-state ] }
1721         { [ dup CHAR: \0 = ] [
1722             drop
1723             unexpected-null-character
1724             CHAR: replacement-character pick push-doctype-public-identifier
1725             doctype-public-identifier-double-quoted-state
1726         ] }
1727         { [ dup CHAR: > = ] [
1728             drop abrupt-doctype-public-identifier
1729             pick [ force-quirks ] [ emit-doctype ] bi
1730             data-state
1731         ] }
1732         { [ dup f = ] [
1733             drop eof-in-doctype
1734             pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri
1735         ] }
1736         [ reach push-doctype-public-identifier doctype-public-identifier-single-quoted-state ]
1737     } cond ;
1738
1739 : doctype-public-identifier-single-quoted-state ( document n/f string -- document n'/f string )
1740     take-char (doctype-public-identifier-single-quoted-state) ;
1741
1742
1743 : (after-doctype-public-identifier-state) ( document n/f string ch/f -- document n'/f string )
1744     {
1745         { [ dup "\t\n\f\s" member? ] [ drop between-doctype-public-and-system-identifiers-state ] }
1746         { [ dup CHAR: > = ] [
1747             drop pick emit-doctype
1748             data-state
1749         ] }
1750         { [ dup CHAR: " = ] [
1751             drop missing-whitespace-between-doctype-public-and-system-identifiers
1752             pick initialize-doctype-system-identifier
1753             doctype-system-identifier-double-quoted-state
1754         ] }
1755         { [ dup CHAR: ' = ] [
1756             drop missing-whitespace-between-doctype-public-and-system-identifiers
1757             pick initialize-doctype-system-identifier
1758             doctype-system-identifier-single-quoted-state
1759         ] }
1760         { [ dup f = ] [ drop eof-in-doctype pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri ] }
1761         [
1762             missing-quote-before-doctype-system-identifier
1763             [ reach force-quirks ] dip
1764             (bogus-doctype-state)
1765         ]
1766     } cond ;
1767
1768 : after-doctype-public-identifier-state ( document n/f string -- document n'/f string )
1769     take-char (after-doctype-public-identifier-state) ;
1770
1771
1772 : (between-doctype-public-and-system-identifiers-state) ( document n/f string ch/f -- document n'/f string )
1773     {
1774         { [ dup "\t\n\f\s" member? ] [ drop between-doctype-public-and-system-identifiers-state ] }
1775         { [ dup CHAR: > = ] [
1776             drop pick emit-doctype
1777             data-state
1778         ] }
1779         { [ dup CHAR: " = ] [
1780             drop pick initialize-doctype-system-identifier
1781             doctype-system-identifier-double-quoted-state
1782         ] }
1783         { [ dup CHAR: ' = ] [
1784             drop pick initialize-doctype-system-identifier
1785             doctype-system-identifier-single-quoted-state
1786         ] }
1787         { [ dup f = ] [ drop eof-in-doctype pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri ] }
1788         [
1789             missing-quote-before-doctype-system-identifier
1790             [ reach force-quirks ] dip
1791             (bogus-doctype-state)
1792         ]
1793     } cond ;
1794
1795 : between-doctype-public-and-system-identifiers-state ( document n/f string -- document n'/f string )
1796     take-char (between-doctype-public-and-system-identifiers-state) ;
1797
1798
1799 : (after-doctype-system-keyword-state) ( document n/f string ch/f -- document n'/f string )
1800     {
1801         { [ dup "\t\n\f\s" member? ] [ drop between-doctype-public-and-system-identifiers-state ] }
1802         { [ dup CHAR: " = ] [
1803             drop missing-whitespace-after-doctype-system-keyword
1804             pick initialize-doctype-system-identifier
1805             doctype-system-identifier-double-quoted-state
1806         ] }
1807         { [ dup CHAR: ' = ] [
1808             drop missing-whitespace-after-doctype-system-keyword
1809             pick initialize-doctype-system-identifier
1810             doctype-system-identifier-single-quoted-state
1811         ] }
1812         { [ dup CHAR: > = ] [
1813             drop missing-doctype-system-identifier
1814             pick [ force-quirks ] [ emit-doctype ] bi
1815             data-state
1816         ] }
1817         { [ dup f = ] [ drop eof-in-doctype pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri ] }
1818         [
1819             missing-quote-before-doctype-system-identifier
1820             [ reach force-quirks ] dip
1821             (bogus-doctype-state)
1822         ]
1823     } cond ;
1824
1825 : after-doctype-system-keyword-state ( document n/f string -- document n'/f string )
1826     take-char (after-doctype-system-keyword-state) ;
1827
1828
1829 : (before-doctype-system-identifier-state) ( document n/f string ch/f -- document n'/f string )
1830     {
1831         { [ dup "\t\n\f\s" member? ] [ drop before-doctype-system-identifier-state ] }
1832         { [ dup CHAR: " = ] [
1833             drop pick initialize-doctype-system-identifier
1834             doctype-system-identifier-double-quoted-state
1835         ] }
1836         { [ dup CHAR: ' = ] [
1837             drop pick initialize-doctype-system-identifier
1838             doctype-system-identifier-single-quoted-state
1839         ] }
1840         { [ dup CHAR: > = ] [
1841             drop missing-doctype-system-identifier
1842             pick [ force-quirks ] [ emit-doctype ] bi
1843             data-state
1844         ] }
1845         { [ dup f = ] [ drop eof-in-doctype pick [ emit-doctype ] [ emit-eof ] bi ] }
1846         [
1847             missing-quote-before-doctype-system-identifier
1848             [ reach force-quirks ] dip
1849             (bogus-doctype-state)
1850         ]
1851     } cond ;
1852
1853 : before-doctype-system-identifier-state ( document n/f string -- document n'/f string )
1854     take-char (before-doctype-system-identifier-state) ;
1855
1856
1857 : (doctype-system-identifier-double-quoted-state) ( document n/f string ch/f -- document n'/f string )
1858     {
1859         { [ dup CHAR: " = ] [ drop after-doctype-system-identifier-state ] }
1860         { [ dup CHAR: \0 = ] [
1861             drop
1862             unexpected-null-character
1863             CHAR: replacement-character pick push-doctype-system-identifier
1864             doctype-system-identifier-double-quoted-state
1865         ] }
1866         { [ dup CHAR: > = ] [
1867             drop abrupt-doctype-system-identifier
1868             pick [ force-quirks ] [ emit-doctype ] bi
1869             data-state
1870         ] }
1871         { [ dup f = ] [
1872             drop eof-in-doctype
1873             pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri
1874         ] }
1875         [ reach push-doctype-system-identifier doctype-system-identifier-double-quoted-state ]
1876     } cond ;
1877
1878 : doctype-system-identifier-double-quoted-state ( document n/f string -- document n'/f string )
1879     take-char (doctype-system-identifier-double-quoted-state) ;
1880
1881
1882 : (doctype-system-identifier-single-quoted-state) ( document n/f string ch/f -- document n'/f string )
1883     {
1884         { [ dup CHAR: ' = ] [ drop after-doctype-system-identifier-state ] }
1885         { [ dup CHAR: \0 = ] [
1886             drop
1887             unexpected-null-character
1888             CHAR: replacement-character pick push-doctype-system-identifier
1889             doctype-system-identifier-double-quoted-state
1890         ] }
1891         { [ dup CHAR: > = ] [
1892             drop abrupt-doctype-system-identifier
1893             pick [ force-quirks ] [ emit-doctype ] bi
1894             data-state
1895         ] }
1896         { [ dup f = ] [
1897             drop eof-in-doctype
1898             pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri
1899         ] }
1900         [ reach push-doctype-system-identifier doctype-system-identifier-single-quoted-state ]
1901     } cond ;
1902
1903 : doctype-system-identifier-single-quoted-state ( document n/f string -- document n'/f string )
1904     take-char (doctype-system-identifier-single-quoted-state) ;
1905
1906
1907 : (after-doctype-system-identifier-state) ( document n/f string ch/f -- document n'/f string )
1908     {
1909         { [ dup "\t\n\f\s" member? ] [ drop after-doctype-system-identifier-state ] }
1910         { [ dup CHAR: > = ] [
1911             drop pick emit-doctype
1912             data-state
1913         ] }
1914         { [ dup f = ] [ drop eof-in-doctype pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri ] }
1915         [
1916             unexpected-character-after-doctype-system-identifier
1917             [ reach force-quirks ] dip
1918             (bogus-doctype-state)
1919         ]
1920     } cond ;
1921
1922 : after-doctype-system-identifier-state ( document n/f string -- document n'/f string )
1923     take-char (after-doctype-system-identifier-state) ;
1924
1925
1926 : (bogus-doctype-state) ( document n/f string ch/f -- document n'/f string )
1927     {
1928         { [ dup CHAR: > = ] [ drop pick emit-doctype data-state ] }
1929         { [ dup CHAR: \0 = ] [ drop unexpected-null-character bogus-doctype-state ] }
1930         { [ dup f = ] [ drop eof-in-doctype pick emit-eof ] }
1931         [ drop bogus-doctype-state ]
1932     } cond ;
1933
1934 : bogus-doctype-state ( document n/f string -- document n'/f string )
1935     take-char (bogus-doctype-state) ;
1936
1937
1938 : (cdata-section-state) ( document n/f string ch/f -- document n'/f string )
1939     {
1940         { [ dup CHAR: ] = ] [ drop cdata-section-bracket-state ] }
1941         { [ dup f = ] [ drop eof-in-cdata pick emit-eof ] }
1942         [ reach emit-char cdata-section-state ]
1943     } cond ;
1944
1945 : cdata-section-state ( document n/f string -- document n'/f string )
1946     take-char (cdata-section-state) ;
1947
1948
1949 : (cdata-section-bracket-state) ( document n/f string ch/f -- document n'/f string )
1950     {
1951         { [ dup CHAR: ] = ] [ drop cdata-section-end-state ] }
1952         [ [ CHAR: ] reach emit-char ] dip (cdata-section-state) ]
1953     } cond ;
1954
1955 : cdata-section-bracket-state ( document n/f string -- document n'/f string )
1956     take-char (cdata-section-bracket-state) ;
1957
1958
1959 : (cdata-section-end-state) ( document n/f string ch/f -- document n'/f string )
1960     {
1961         { [ dup CHAR: ] = ] [ reach emit-char cdata-section-end-state ] }
1962         { [ dup CHAR: > = ] [ drop data-state ] }
1963         [ [ "]]" reach emit-string ] dip (cdata-section-state) ]
1964     } cond ;
1965
1966 : cdata-section-end-state ( document n/f string -- document n'/f string )
1967     take-char (cdata-section-end-state) ;
1968
1969
1970 : (character-reference-state) ( document n/f string ch/f -- document n'/f string )
1971     [ CHAR: & reach ch>new-temporary-buffer ] dip
1972     {
1973         { [ dup ascii-alphanumeric? ] [ (named-character-reference-state) ] }
1974         { [ dup CHAR: # = ] [ reach push-temporary-buffer numeric-character-reference-state ] }
1975         [ reach flush-temporary-buffer (return-state) ]
1976     } cond ;
1977
1978 : character-reference-state ( document n/f string -- document n'/f string )
1979     take-char (character-reference-state) ;
1980
1981
1982 : (named-character-reference-state) ( document n/f string ch/f -- document n'/f string )
1983     reach push-temporary-buffer
1984     pick named-character-match?
1985     [
1986         drop ! exact match, drop prefix match
1987         ! XXX: check me
1988         {
1989             [ pick temporary-buffer-attribute? ]
1990             [ pick temporary-buffer>> ?last CHAR: ; = not ]
1991             [ 3dup peek-from { [ CHAR: = = ] [ ascii-alphanumeric? ] } 1|| ]
1992         } 0&& [
1993             unimplemented*
1994             flush-temporary-buffer
1995             return-state
1996         ] [
1997             pick [ take-named-character ] [ flush-temporary-buffer ] bi return-state
1998         ] if
1999     ] [
2000         ! prefix match?
2001         [ named-character-reference-state ]
2002         [ pick flush-temporary-buffer ambiguous-ampersand-state ] if
2003     ] if ;
2004
2005 : named-character-reference-state ( document n/f string -- document n'/f string )
2006     take-char (named-character-reference-state) ;
2007
2008
2009 : (ambiguous-ampersand-state) ( document n/f string ch/f -- document n'/f string )
2010     {
2011         { [ dup ascii-alphanumeric? ] [
2012             unimplemented*
2013         ] }
2014         { [ dup CHAR: ; = ] [ unknown-named-character-reference (return-state) ] }
2015         [ (return-state) ]
2016     } cond ;
2017
2018 : ambiguous-ampersand-state ( document n/f string -- document n'/f string )
2019     take-char (ambiguous-ampersand-state) ;
2020
2021
2022 : (numeric-character-reference-state) ( document n/f string ch/f -- document n'/f string )
2023     {
2024         { [ dup "xX" member? ] [ reach push-temporary-buffer hexadecimal-character-reference-start-state ] }
2025         [ (decimal-character-reference-start-state) ]
2026     } cond ;
2027
2028 : numeric-character-reference-state ( document n/f string -- document n'/f string )
2029     take-char (numeric-character-reference-state) ;
2030
2031
2032 : (hexadecimal-character-reference-start-state) ( document n/f string ch/f -- document n'/f string )
2033     {
2034         { [ dup ascii-hex-digit? ] [ (hexadecimal-character-reference-state) ] }
2035         [ absence-of-digits-in-numeric-character-reference reach flush-temporary-buffer (return-state) ]
2036     } cond ;
2037
2038 : hexadecimal-character-reference-start-state ( document n/f string -- document n'/f string )
2039     take-char (hexadecimal-character-reference-start-state) ;
2040
2041
2042 : (decimal-character-reference-start-state) ( document n/f string ch/f -- document n'/f string )
2043     {
2044         { [ dup ascii-digit? ] [ (decimal-character-reference-state) ] }
2045         [ absence-of-digits-in-numeric-character-reference reach flush-temporary-buffer (return-state) ]
2046     } cond ;
2047
2048 : decimal-character-reference-start-state ( document n/f string -- document n'/f string )
2049     take-char (decimal-character-reference-start-state) ;
2050
2051
2052 : (hexadecimal-character-reference-state) ( document n/f string ch/f -- document n'/f string )
2053     {
2054         { [ dup ascii-digit? ] [ unimplemented* ] }
2055         { [ dup ascii-upper-hex-digit? ] [ unimplemented* ] }
2056         { [ dup ascii-lower-hex-digit? ] [ unimplemented* ] }
2057         { [ dup CHAR: ; = ] [ drop numeric-character-reference-end-state ] }
2058         [ missing-semicolon-after-character-reference ]
2059     } cond ;
2060
2061 : hexadecimal-character-reference-state ( document n/f string -- document n'/f string )
2062     take-char (hexadecimal-character-reference-state) ;
2063
2064
2065 : (decimal-character-reference-state) ( document n/f string ch/f -- document n'/f string )
2066     {
2067         { [ dup ascii-digit? ] [ unimplemented* ] }
2068         { [ dup CHAR: ; = ] [ drop numeric-character-reference-end-state ] }
2069         [ missing-semicolon-after-character-reference ]
2070     } cond ;
2071
2072 : decimal-character-reference-state ( document n/f string -- document n'/f string )
2073     take-char (decimal-character-reference-state) ;
2074
2075
2076 : (numeric-character-reference-end-state) ( document n/f string ch/f -- document n'/f string )
2077     {
2078         [ missing-semicolon-after-character-reference ]
2079     } cond ;
2080
2081 : numeric-character-reference-end-state ( document n/f string -- document n'/f string )
2082     take-char (numeric-character-reference-end-state) ;
2083
2084
2085
2086 : parse-html5 ( string -- document )
2087     [ <document> 0 ] dip data-state 2drop ;