]> gitweb.factorcode.org Git - factor.git/blobdiff - extra/html5/html5.factor
html5: move to extra, use take-char
[factor.git] / extra / html5 / html5.factor
diff --git a/extra/html5/html5.factor b/extra/html5/html5.factor
new file mode 100644 (file)
index 0000000..435f4c5
--- /dev/null
@@ -0,0 +1,2087 @@
+! Copyright (C) 2020 Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: accessors arrays assocs combinators
+combinators.short-circuit io io.encodings.utf8 io.files
+json.reader kernel math math.order memoize modern.slices
+prettyprint sequences sequences.extras strings suffix-arrays
+words ;
+IN: html5
+
+: 1sbuf ( ch -- sbuf ) [ SBUF" " clone ] dip over push ; inline
+: ?1sbuf ( ch -- sbuf ) [ SBUF" " clone ] dip [ over push ] when* ; inline
+
+! https://html.spec.whatwg.org/multipage/parsing.html#tokenization
+
+! https://infra.spec.whatwg.org/#namespaces
+CONSTANT: html-namespace "http://www.w3.org/1999/xhtml"
+CONSTANT: mathml-namespace "http://www.w3.org/1998/Math/MathML"
+CONSTANT: svg-namespace "http://www.w3.org/2000/svg"
+CONSTANT: xlink-namespace "http://www.w3.org/1999/xlink"
+CONSTANT: xml-namespace "http://www.w3.org/XML/1998/namespace"
+CONSTANT: xmlns-namespace "http://www.w3.org/2000/xmlns/"
+
+DEFER: data-state
+DEFER: (data-state)
+DEFER: rcdata-state
+DEFER: (rcdata-state)
+DEFER: rawtext-state
+DEFER: (rawtext-state)
+DEFER: script-data-state
+DEFER: (script-data-state)
+DEFER: plaintext-state
+DEFER: (plaintext-state)
+DEFER: tag-open-state
+DEFER: (tag-open-state)
+DEFER: end-tag-open-state
+DEFER: (end-tag-open-state)
+DEFER: tag-name-state
+DEFER: (tag-name-state)
+DEFER: rcdata-less-than-sign-state
+DEFER: (rcdata-less-than-sign-state)
+DEFER: rcdata-end-tag-open-state
+DEFER: (rcdata-end-tag-open-state)
+DEFER: rcdata-end-tag-name-state
+DEFER: (rcdata-end-tag-name-state)
+DEFER: rawtext-less-than-sign-state
+DEFER: (rawtext-less-than-sign-state)
+DEFER: rawtext-end-tag-open-state
+DEFER: (rawtext-end-tag-open-state)
+DEFER: rawtext-end-tag-name-state
+DEFER: (rawtext-end-tag-name-state)
+DEFER: script-data-less-than-sign-state
+DEFER: (script-data-less-than-sign-state)
+DEFER: script-data-end-tag-open-state
+DEFER: (script-data-end-tag-open-state)
+DEFER: script-data-end-tag-name-state
+DEFER: (script-data-end-tag-name-state)
+DEFER: script-data-escape-start-state
+DEFER: (script-data-escape-start-state)
+DEFER: script-data-escape-start-dash-state
+DEFER: (script-data-escape-start-dash-state)
+DEFER: script-data-escaped-state
+DEFER: (script-data-escaped-state)
+DEFER: script-data-escaped-dash-state
+DEFER: (script-data-escaped-dash-state)
+DEFER: script-data-escaped-dash-dash-state
+DEFER: (script-data-escaped-dash-dash-state)
+DEFER: script-data-escaped-less-than-sign-state
+DEFER: (script-data-escaped-less-than-sign-state)
+DEFER: script-data-escaped-end-tag-open-state
+DEFER: (script-data-escaped-end-tag-open-state)
+DEFER: script-data-escaped-end-tag-name-state
+DEFER: (script-data-escaped-end-tag-name-state)
+DEFER: script-data-double-escape-start-state
+DEFER: (script-data-double-escape-start-state)
+DEFER: script-data-double-escaped-state
+DEFER: (script-data-double-escaped-state)
+DEFER: script-data-double-escaped-dash-state
+DEFER: (script-data-double-escaped-dash-state)
+DEFER: script-data-double-escaped-dash-dash-state
+DEFER: (script-data-double-escaped-dash-dash-state)
+DEFER: script-data-double-escaped-less-than-sign-state
+DEFER: (script-data-double-escaped-less-than-sign-state)
+DEFER: script-data-double-escape-end-state
+DEFER: (script-data-double-escape-end-state)
+DEFER: before-attribute-name-state
+DEFER: (before-attribute-name-state)
+DEFER: attribute-name-state
+DEFER: (attribute-name-state)
+DEFER: after-attribute-name-state
+DEFER: (after-attribute-name-state)
+DEFER: before-attribute-value-state
+DEFER: (before-attribute-value-state)
+DEFER: attribute-value-double-quoted-state
+DEFER: (attribute-value-double-quoted-state)
+DEFER: attribute-value-single-quoted-state
+DEFER: (attribute-value-single-quoted-state)
+DEFER: attribute-value-unquoted-state
+DEFER: (attribute-value-unquoted-state)
+DEFER: after-attribute-value-quoted-state
+DEFER: (after-attribute-value-quoted-state)
+DEFER: self-closing-start-tag-state
+DEFER: (self-closing-start-tag-state)
+DEFER: bogus-comment-state
+DEFER: (bogus-comment-state)
+DEFER: markup-declaration-open-state
+DEFER: (markup-declaration-open-state)
+DEFER: comment-start-state
+DEFER: (comment-start-state)
+DEFER: comment-start-dash-state
+DEFER: (comment-start-dash-state)
+DEFER: comment-state
+DEFER: (comment-state)
+DEFER: comment-less-than-sign-state
+DEFER: (comment-less-than-sign-state)
+DEFER: comment-less-than-sign-bang-state
+DEFER: (comment-less-than-sign-bang-state)
+DEFER: comment-less-than-sign-bang-dash-state
+DEFER: (comment-less-than-sign-bang-dash-state)
+DEFER: comment-less-than-sign-bang-dash-dash-state
+DEFER: (comment-less-than-sign-bang-dash-dash-state)
+DEFER: comment-end-dash-state
+DEFER: (comment-end-dash-state)
+DEFER: comment-end-state
+DEFER: (comment-end-state)
+DEFER: comment-end-bang-state
+DEFER: (comment-end-bang-state)
+DEFER: doctype-state
+DEFER: (doctype-state)
+DEFER: before-doctype-name-state
+DEFER: (before-doctype-name-state)
+DEFER: doctype-name-state
+DEFER: (doctype-name-state)
+DEFER: after-doctype-name-state
+DEFER: (after-doctype-name-state)
+DEFER: after-doctype-public-keyword-state
+DEFER: (after-doctype-public-keyword-state)
+DEFER: before-doctype-public-identifier-state
+DEFER: (before-doctype-public-identifier-state)
+DEFER: doctype-public-identifier-double-quoted-state
+DEFER: (doctype-public-identifier-double-quoted-state)
+DEFER: doctype-public-identifier-single-quoted-state
+DEFER: (doctype-public-identifier-single-quoted-state)
+DEFER: after-doctype-public-identifier-state
+DEFER: (after-doctype-public-identifier-state)
+DEFER: between-doctype-public-and-system-identifiers-state
+DEFER: (between-doctype-public-and-system-identifiers-state)
+DEFER: after-doctype-system-keyword-state
+DEFER: (after-doctype-system-keyword-state)
+DEFER: before-doctype-system-identifier-state
+DEFER: (before-doctype-system-identifier-state)
+DEFER: doctype-system-identifier-double-quoted-state
+DEFER: (doctype-system-identifier-double-quoted-state)
+DEFER: doctype-system-identifier-single-quoted-state
+DEFER: (doctype-system-identifier-single-quoted-state)
+DEFER: after-doctype-system-identifier-state
+DEFER: (after-doctype-system-identifier-state)
+DEFER: bogus-doctype-state
+DEFER: (bogus-doctype-state)
+DEFER: cdata-section-state
+DEFER: (cdata-section-state)
+DEFER: cdata-section-bracket-state
+DEFER: (cdata-section-bracket-state)
+DEFER: cdata-section-end-state
+DEFER: (cdata-section-end-state)
+DEFER: character-reference-state
+DEFER: (character-reference-state)
+DEFER: named-character-reference-state
+DEFER: (named-character-reference-state)
+DEFER: ambiguous-ampersand-state
+DEFER: (ambiguous-ampersand-state)
+DEFER: numeric-character-reference-state
+DEFER: (numeric-character-reference-state)
+DEFER: hexadecimal-character-reference-start-state
+DEFER: (hexadecimal-character-reference-start-state)
+DEFER: decimal-character-reference-start-state
+DEFER: (decimal-character-reference-start-state)
+DEFER: hexadecimal-character-reference-state
+DEFER: (hexadecimal-character-reference-state)
+DEFER: decimal-character-reference-state
+DEFER: (decimal-character-reference-state)
+DEFER: numeric-character-reference-end-state
+DEFER: (numeric-character-reference-end-state)
+
+
+ERROR: unimplemented string ;
+ERROR: unimplemented* ;
+
+! Errors: https://html.spec.whatwg.org/multipage/parsing.html#parse-errors
+ERROR: abrupt-closing-of-empty-comment ;
+ERROR: abrupt-doctype-public-identifier ;
+ERROR: abrupt-doctype-system-identifier ;
+ERROR: absence-of-digits-in-numeric-character-reference ;
+ERROR: cdata-in-html-content ;
+ERROR: character-reference-outside-unicode-range ;
+ERROR: control-character-in-input-stream ;
+ERROR: control-character-reference ;
+ERROR: end-tag-with-attributes ;
+ERROR: duplicate-attribute ;
+ERROR: end-tag-with-trailing-solidus ;
+ERROR: eof-before-tag-name ;
+ERROR: eof-in-cdata ;
+ERROR: eof-in-comment ;
+ERROR: eof-in-doctype ;
+ERROR: eof-in-script-html-comment-like-text ;
+ERROR: eof-in-tag ;
+ERROR: incorrectly-closed-comment ;
+ERROR: incorrectly-opened-comment ;
+ERROR: invalid-character-sequence-after-doctype-name ;
+ERROR: invalid-first-character-of-tag-name ;
+ERROR: missing-attribute-value ;
+ERROR: missing-doctype-name ;
+ERROR: missing-doctype-public-identifier ;
+ERROR: missing-doctype-system-identifier ;
+ERROR: missing-end-tag-name ;
+ERROR: missing-quote-before-doctype-public-identifier ;
+
+ERROR: missing-quote-before-doctype-system-identifier ;
+ERROR: missing-semicolon-after-character-reference ;
+ERROR: missing-whitespace-after-doctype-public-keyword ;
+ERROR: missing-whitespace-after-doctype-system-keyword ;
+ERROR: missing-whitespace-before-doctype-name ;
+ERROR: missing-whitespace-between-attributes ;
+ERROR: missing-whitespace-between-doctype-public-and-system-identifiers ;
+ERROR: nested-comment ;
+ERROR: noncharacter-character-reference ;
+ERROR: noncharacter-in-input-stream ;
+ERROR: non-void-html-element-start-tag-with-trailing-solidus ;
+ERROR: null-character-reference ;
+ERROR: surrogate-character-reference ;
+ERROR: surrogate-in-input-stream ;
+ERROR: unexpected-character-after-doctype-system-identifier ;
+ERROR: unexpected-character-in-attribute-name ;
+ERROR: unexpected-character-in-unquoted-attribute-value ;
+ERROR: unexpected-equals-sign-before-attribute-name ;
+ERROR: unexpected-null-character ;
+ERROR: unexpected-question-mark-instead-of-tag-name ;
+ERROR: unexpected-solidus-in-tag ;
+ERROR: unknown-named-character-reference ;
+
+! Tree insertion modes
+SINGLETONS: initial-mode before-html-mode before-head-mode
+in-head-mode in-head-noscript-mode after-head-mode
+in-body-mode text-mode in-table-mode in-table-text-mode
+in-caption-mode in-column-group-mode in-table-body-mode
+in-row-mode in-cell-mode in-select-mode in-select-in-table-mode in-template-mode
+after-body-mode in-frameset-mode after-frameset-mode after-after-body-mode
+after-after-frameset-mode ;
+
+TUPLE: document
+quirks-mode?
+limited-quirks-mode?
+iframe-srcdoc?
+scripting? ! set in constructor
+frameset-ok? ! frameset-ok? but we want default to f
+fostering-parent?
+tree
+tree-doctype
+head-element-pointer ! set during insertion time
+parser-cannot-change-mode-flag
+insertion-mode
+original-insertion-mode
+last
+node
+context
+doctype
+tag
+end-tag
+
+tag-name
+end-tag-name
+attribute-name
+attribute-value
+temporary-buffer
+comment-token
+open-elements
+return-state ;
+
+! "reset the insertion mode appropriately"
+! : reset-insertion-mode ( document -- document )
+!     f >>last
+!     dup open-elements>> ?last >>node
+!     dup [ open-elements>> ?first ] [ node>> ] bi = [
+!         t >>last dup node>> >>context
+!     ] when
+!     dup node>> {
+!         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
+!         { [
+!             dup name>> >lower { "td" "th" } member?
+!             pick last>> f = and
+!         ] [ drop in-select >>insertion-mode ] }
+!         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
+!         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
+!         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
+!         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
+!         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
+!         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
+!         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
+!         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
+!         { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] }
+!     } cond
+!     ;
+
+: temporary-buffer-attribute? ( document -- ? )
+    return-state>>
+    {
+        attribute-value-unquoted-state
+        attribute-value-single-quoted-state
+        attribute-value-double-quoted-state
+    } member? ;
+
+! name, public/system identifier should not be empty strings
+! until the state machine demands it
+TUPLE: doctype
+    name
+    public-identifier
+    system-identifier
+    quirks? ;
+
+: <doctype> ( -- doctype )
+    doctype new ; inline
+
+: new-doctype-from-ch ( ch document -- )
+    [
+        doctype new
+            swap ?1sbuf >>name
+    ] dip doctype<< ; inline
+
+: new-doctype-with-quirks ( document -- )
+    <doctype> t >>quirks? >>doctype drop ;
+
+TUPLE: tag self-closing? name attributes children end-tag ;
+
+: <tag> ( -- tag )
+    tag new
+        SBUF" " clone >>name
+        V{ } clone >>attributes
+        V{ } clone >>children ;
+
+TUPLE: end-tag self-closing? name attributes ;
+
+: <end-tag> ( -- tag )
+    end-tag new
+        SBUF" " clone >>name
+        V{ } clone >>attributes ;
+
+: new-tag ( document -- )
+    <tag> >>tag drop ;
+
+: new-end-tag ( document -- )
+    <end-tag> >>tag drop ;
+
+: set-self-closing ( document -- )
+    tag>> t >>self-closing? drop ;
+
+: <document> ( -- document )
+    document new
+        V{ } clone >>tree
+        initial-mode >>insertion-mode
+        <doctype> >>doctype
+        t >>frameset-ok?
+        ! SBUF" " clone >>tag-name
+        SBUF" " clone >>attribute-name
+        SBUF" " clone >>attribute-value
+        SBUF" " clone >>temporary-buffer
+        SBUF" " clone >>comment-token
+        V{ } clone >>open-elements
+    ; inline
+
+TUPLE: comment open payload close ;
+
+: <comment> ( payload -- comment )
+    comment new
+        swap >>payload ; inline
+
+: force-quirks ( document -- )
+    doctype>> t >>quirks? drop ;
+
+: initialize-doctype-name ( document -- )
+    [ SBUF" " clone ] dip doctype>> name<< ;
+
+: initialize-doctype-public-identifier ( document -- )
+    [ SBUF" " clone ] dip doctype>> public-identifier<< ;
+
+: initialize-doctype-system-identifier ( document -- )
+    [ SBUF" " clone ] dip doctype>> system-identifier<< ;
+
+: push-doctype-name ( ch document -- )
+    doctype>> name>> push ;
+
+: push-doctype-public-identifier ( ch document -- )
+    doctype>> public-identifier>> push ;
+
+: push-doctype-system-identifier ( ch document -- )
+    doctype>> system-identifier>> push ;
+
+! XXX: not html5 spec, fix
+ERROR: unmatched-closing-tag-error stack tag ;
+
+: unclosed-tag? ( obj -- ? )
+    { [ tag? ] [ end-tag>> not ] } 1&& ; inline
+
+:: find-matching-tag ( name stack -- seq )
+    stack [ { [ unclosed-tag? ] [ name>> name = ] } 1&& ] find-last drop [
+        stack swap shorten*
+    ] [
+        stack name unmatched-closing-tag-error
+    ] if* ;
+
+DEFER: tree-insert
+GENERIC: tree-insert* ( document obj insertion-mode -- document )
+
+: limited-quirks-mode? ( doctype -- ? )
+    {
+        [ public-identifier>> "-//W3C//DTD XHTML 1.0 Frameset//" head? ]
+        [ public-identifier>> "-//W3C//DTD XHTML 1.0 Transitional//" head? ]
+        [ { [ system-identifier>> ] [ public-identifier>> "-//W3C//DTD HTML 4.01 Frameset//" head? ] } 1&& ]
+        [ { [ system-identifier>> ] [ public-identifier>> "-//W3C//DTD HTML 4.01 Transitional//" head? ] } 1&& ]
+    } 1|| ;
+
+! https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode
+M: initial-mode tree-insert*
+    drop {
+        { [ dup "\t\n\f\r\s" member? ] [ drop ] }
+        { [ dup doctype? ] [
+            >>tree-doctype before-html-mode >>insertion-mode
+        ] }
+        { [ dup comment? ] [ over tree>> push ] }
+        { [ dup tag? ] [ over tree>> push ] }
+        { [ dup end-tag? ] [
+            dup name>> pick tree>> find-matching-tag
+            unclip
+                swap >>children
+                swap >>end-tag
+            over tree>> push
+        ] }
+        [
+            over iframe-srcdoc?>> [
+                over parser-cannot-change-mode-flag>> [
+                    [ t >>quirks-mode? ] dip
+                ] unless
+            ] [
+                "must be iframe-srcdoc here" throw
+            ] if
+            ! reprocess the token
+            before-html-mode >>insertion-mode tree-insert
+        ]
+    } cond ;
+
+! https://html.spec.whatwg.org/multipage/parsing.html#the-before-html-insertion-mode
+M: before-html-mode tree-insert*
+    drop {
+        { [ dup doctype? ] [ drop ] }
+        { [ dup comment? ] [ over tree>> push ] }
+        { [ dup "\t\n\f\r\s" member? ] [ drop ] }
+        { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [
+            over tree>> push
+            before-head-mode >>insertion-mode
+        ] }
+        ! these tags are handled in the default case
+        { [ dup { [ end-tag? ] [ name>> { "head" "body" "html" "br" } member? not ] } 1&& ] [
+            ! error end-tag, ignore
+            drop
+        ] }
+        [
+            ! Create missing html tag and reprocess the token
+            <tag> "html" >>name pick tree>> push
+            before-head-mode >>insertion-mode tree-insert
+        ]
+    } cond ;
+
+M: before-head-mode tree-insert*
+    drop {
+        { [ dup "\t\n\f\r\s" member? ] [ drop ] }
+        { [ dup comment? ] [ over tree>> push ] }
+        { [ dup doctype? ] [ drop ] }
+        { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [
+            ! XXX: in-body-mode rules here for html tag
+            ! B
+            ! over tree>> push
+            ! before-head-mode >>insertion-mode
+            "handle html in-body-mode here" throw
+        ] }
+        { [ dup { [ tag? ] [ name>> "head" = ] } 1&& ] [
+            [ swap tree>> push ]
+            [ >>head-element-pointer drop ]
+            [ drop in-head-mode >>insertion-mode ] 2tri
+        ] }
+        ! these tags are handled in the default case
+        { [ dup { [ end-tag? ] [ name>> { "head" "body" "html" "br" } member? not ] } 1&& ] [
+            ! error end-tag, ignore
+            drop
+        ] }
+        ! ignore tag
+        { [ dup tag? ] [ drop ] }
+        [
+            ! Create missing html tag and reprocess the token
+            <tag>
+            [ "head" >>name pick tree>> push ]
+            [ >>head-element-pointer ] bi
+            in-head-mode >>insertion-mode tree-insert
+        ]
+    } cond ;
+
+M: in-head-mode tree-insert*
+    drop {
+        { [ dup "\t\n\f\r\s" member? ] [ over tree>> push ] }
+        { [ dup comment? ] [ over tree>> push ] }
+        { [ dup doctype? ] [ drop ] }
+        { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [
+            ! XXX: in-body-mode rules here for html tag
+            ! B
+            ! over tree>> push
+            ! before-head-mode >>insertion-mode
+            "handle html in-body-mode here" throw
+            unimplemented*
+        ] }
+        { [ dup { [ tag? ] [ name>> { "base" "basefont" "bgsound" "link" } member? ] } 1&& ] [
+            ! non-void-html-element-start-tag-with-trailing-solidus soft error if not self-closing
+            unimplemented*
+        ] }
+        { [ dup { [ tag? ] [ name>> "meta" = ] } 1&& ] [
+            unimplemented*
+        ] }
+        { [ dup { [ tag? ] [ name>> "title" = ] } 1&& ] [
+            ! https://html.spec.whatwg.org/multipage/parsing.html#generic-rcdata-element-parsing-algorithm
+            "insert title node" throw
+            unimplemented*
+        ] }
+        { [
+            dup {
+                [ { [ tag? ] [ name>> "noscript" = ] [ scripting?>> ] } 1&& ]
+                [ { [ tag? ] [ name>> { "noframes" "style" } member? ] } 1&& ]
+            } 1||
+        ] [
+            ! https://html.spec.whatwg.org/multipage/parsing.html#generic-raw-text-element-parsing-algorithm
+            unimplemented*
+        ] }
+        { [ dup { [ tag? ] [ name>> "noscript" = ] [ scripting?>> not ] } 1&& ] [
+            unimplemented*
+            over tree>> push
+            in-head-noscript-mode >>insertion-mode
+        ] }
+        { [ dup { [ tag? ] [ name>> "script" = ] } 1&& ] [
+            unimplemented*
+            text-mode >>insertion-mode
+        ] }
+        { [ dup { [ end-tag? ] [ name>> "head" = ] } 1&& ] [
+            over tree>> last end-tag<<
+            after-head-mode >>insertion-mode
+        ] }
+        { [ dup { [ end-tag? ] [ name>> { "body" "html" "br" } member? ] } 1&& ] [
+            ! non-void-html-element-start-tag-with-trailing-solidus soft error if not self-closing
+            unimplemented*
+        ] }
+        { [ dup { [ tag? ] [ name>> "template" = ] } 1&& ] [
+            unimplemented*
+            in-template-mode >>insertion-mode
+        ] }
+        { [ dup { [ end-tag? ] [ name>> "template" = ] } 1&& ] [
+            unimplemented*
+        ] }
+        ! XXX: revisit this
+        { [ dup {
+            [ { [ tag? ] [ name>> "head" = ] } 1&& ]
+            [ end-tag? ]
+            } 1|| ] [ drop "ignore here" throw ] }
+        [
+            ! end head tag should be here, pop off, reprocess
+            over tree>> pop swap >>end-tag
+            after-head-mode >>insertion-mode "omg" throw
+        ]
+    } cond ;
+
+M: in-head-noscript-mode tree-insert* drop unimplemented* ;
+
+M: after-head-mode tree-insert*
+    drop {
+        { [ dup "\t\n\f\r\s" member? ] [ over tree>> push ] }
+        { [ dup comment? ] [ over tree>> push ] }
+        { [ dup doctype? ] [ drop ] }
+        { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [
+            ! XXX: in-body-mode rules here for html tag
+            ! B
+            ! over tree>> push
+            ! before-head-mode >>insertion-mode
+            "handle html in-body-mode here" throw
+            unimplemented*
+        ] }
+        { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [
+            unimplemented*
+        ] }
+        { [ dup { [ tag? ] [ name>> "body" = ] } 1&& ] [
+            over tree>> push
+            f >>frameset-ok?
+            in-body-mode >>insertion-mode
+        ] }
+        { [ dup { [ tag? ] [ name>> "frameset" = ] } 1&& ] [
+            unimplemented*
+        ] }
+        { [ dup { [ tag? ] [
+            name>> {
+                "base" "basefont" "bgsound" "link" "meta"
+                "noframes" "script" "style" "template" "title"
+            } member? ] } 1&&
+        ] [
+            unimplemented*
+        ] }
+        { [ dup { [ end-tag? ] [ name>> "template" = ] } 1&& ] [
+            unimplemented*
+        ] }
+        ! same as default case
+        ! { [ dup { [ end-tag? ] [ name>> { "body" "html" "br" } member? not ] } 1&& ] [
+        !     unimplemented*
+        ! ] }
+        { [
+            dup {
+                [ { [ tag? ] [ name>> "head" = ] } 1&& ]
+                [ { [ end-tag? ] [ name>> { "body" "html" "br" } member? not ] } 1&& ]
+            } 1||
+        ] [
+            "omg revisit this" throw
+            unimplemented*
+        ] }
+        [
+            B
+            <tag> "body" >>name pick tree>> push
+            in-body-mode >>insertion-mode tree-insert
+        ]
+    } cond ;
+
+M: in-body-mode tree-insert*
+    drop {
+        { [ dup CHAR: \0 = ] [ drop ] }
+        { [ dup "\t\n\f\r\s" member? ] [ over tree>> push ] }
+        { [ dup comment? ] [ over tree>> push ] }
+        { [ dup doctype? ] [ drop ] }
+        { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [ drop ] }
+        { [ dup {
+            [
+                {
+                    [ tag? ]
+                    [
+                        name>> {
+                            "base" "basefont" "bgsound" "link" "meta"
+                            "noframes" "script" "style" "template" "title"
+                        } member?
+                    ]
+                } 1&&
+            ] [
+                { [ end-tag? ] [ name>> "template" = ] } 1&&
+            ] } 1||
+        ] [
+            unimplemented*
+        ] }
+        ! XXX: parse error
+        { [ dup { [ tag? ] [ name>> "body" = ] } 1&& ] [ drop unimplemented* ] }
+        { [ dup { [ tag? ] [ name>> "frameset" = ] } 1&& ] [ drop unimplemented* ] }
+        ! XXX: eof
+        ! { [ ] [ ] }
+        { [ dup { [ end-tag? ] [ name>> "body" = ] } 1&& ] [
+            "body" pick tree>> find-matching-tag
+            unclip
+                swap >>children
+                swap >>end-tag
+            over tree>> push
+
+            after-body-mode >>insertion-mode
+         ] }
+        { [ dup { [ end-tag? ] [ name>> "html" = ] } 1&& ] [ drop unimplemented* ] }
+        ! { [ ] [ ] }
+        [
+            unimplemented*
+        ]
+    } cond ;
+
+M: text-mode tree-insert* drop unimplemented* ;
+M: in-table-mode tree-insert* drop unimplemented* ;
+M: in-table-text-mode tree-insert* drop unimplemented* ;
+M: in-caption-mode tree-insert* drop unimplemented* ;
+M: in-column-group-mode tree-insert* drop unimplemented* ;
+M: in-table-body-mode tree-insert* drop unimplemented* ;
+M: in-row-mode tree-insert* drop unimplemented* ;
+M: in-cell-mode tree-insert* drop unimplemented* ;
+M: in-select-mode tree-insert* drop unimplemented* ;
+M: in-select-in-table-mode tree-insert* drop unimplemented* ;
+M: in-template-mode tree-insert* drop unimplemented* ;
+M: after-body-mode tree-insert*
+    drop {
+        { [ dup "\t\n\f\r\s" member? ] [ over tree>> push ] }
+        { [ dup comment? ] [ over tree>> push ] }
+        { [ dup doctype? ] [ drop ] }
+        { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [
+            unimplemented*
+         ] }
+        { [ dup { [ end-tag? ] [ name>> "html" = ] } 1&& ] [
+            ! XXX: make this a function
+            "html" pick tree>> find-matching-tag
+            unclip
+                swap >>children
+                swap >>end-tag
+            over tree>> push
+
+            after-after-body-mode >>insertion-mode
+        ] }
+        [
+            unimplemented*
+        ]
+    } cond ;
+M: in-frameset-mode tree-insert* drop unimplemented* ;
+M: after-frameset-mode tree-insert* drop unimplemented* ;
+
+M: after-after-body-mode tree-insert*
+    drop {
+        { [ dup comment? ] [ over tree>> push ] }
+        { [ dup doctype? ] [ unimplemented*  ] }
+        { [ dup "\t\n\f\r\s" member? ] [ unimplemented*  ] }
+        { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [ unimplemented* ] }
+        ! eof
+        { [ dup f = ] [ drop ] }
+        [
+            ! XXX: parse error
+            [ in-body-mode >>insertion-mode ] dip tree-insert
+        ]
+    } cond ;
+
+M: after-after-frameset-mode tree-insert* drop unimplemented* ;
+
+: tree-insert ( document obj -- document )
+    over insertion-mode>> tree-insert* ;
+
+MEMO: load-entities ( -- assoc )
+    "vocab:html5/entities.json" utf8 file-contents json> ;
+
+MEMO: entities-suffix-array ( -- assoc )
+    load-entities keys >suffix-array ;
+
+: lookup-entity ( string -- entity/string ? )
+    load-entities ?at ;
+
+: named-character-match? ( document -- prefix? exact? )
+    temporary-buffer>>
+    [ entities-suffix-array query f like ]
+    [ last CHAR: ; = ] bi ;
+
+ERROR: unknown-named-entity entity ;
+: take-named-character ( document -- )
+    dup
+    temporary-buffer>> >string lookup-entity [
+        "characters" of
+        SBUF" " clone-like >>temporary-buffer drop
+    ] [
+        unknown-named-entity
+    ] if ;
+
+! XXX: remove the tag>> name>> push part
+: push-tag-name ( ch document -- )
+    [ tag>> name>> push ]
+    [
+        2drop ! tag-name>> push
+    ] 2bi ;
+: push-attribute-name ( ch document -- ) attribute-name>> push ;
+: push-attribute-value ( ch document -- ) attribute-value>> push ;
+: push-comment-token ( ch document -- ) comment-token>> push ;
+: push-all-comment-token ( string document -- ) comment-token>> push-all ;
+
+ERROR: invalid-return-state obj ;
+: check-return-state ( obj -- return-state )
+    dup word? [ invalid-return-state ] unless ;
+
+: current-attribute ( document -- attribute/f )
+    [ attribute-name>> >string f like ]
+    [ attribute-value>> >string f like ] bi
+    2dup or [ 2array ] [ 2drop f ] if ;
+
+: push-when ( obj/f seq -- )
+    over [ push ] [ 2drop ] if ; inline
+
+: reset-attribute ( document -- )
+    SBUF" " clone >>attribute-name
+    SBUF" " clone >>attribute-value drop ;
+
+: push-attribute ( document -- )
+    [ current-attribute ]
+    [ tag>> attributes>> push-when ]
+    [ reset-attribute ] tri ;
+
+: emit-eof ( document -- )
+    "emit-eof" print
+    f tree-insert drop ;
+: emit-char ( char document -- ) drop "emit-char: " write 1string . ;
+: emit-string ( char document -- ) drop "emit-string: " write . ;
+: emit-tag ( document -- )
+    "emit-tag: " write
+    {
+        [ tag>> [ name>> >string ] [ name<< ] bi ]
+        [ push-attribute ]
+        [ tag>> . ]
+        [ dup tag>> tree-insert drop ]
+        [ f >>tag drop ]
+    } cleave ;
+: emit-end-tag ( document -- )
+    "emit-end-tag: " write
+    [ tag>> . ]
+    [ f >>tag drop ] bi ;
+: emit-comment-token ( document -- )
+    "emit-comment-token: " write
+    {
+        [ comment-token>> >string . ]
+        [ dup comment-token>> >string <comment> tree-insert drop ]
+        [ SBUF" " clone >>comment-token drop ]
+    } cleave ;
+: emit-doctype ( document -- )
+    "emit-doctype: " write dup doctype>> .
+    {
+        [ doctype>> [ >string ] change-name drop ]
+        [
+            ! XXX: handle iframe srcdoc document
+            dup { [ doctype>> name>> "html" = not ] [ parser-cannot-change-mode-flag>> not ] } 1&& [
+                t >>quirks-mode?
+            ] [
+                dup { [ iframe-srcdoc?>> not ] [ parser-cannot-change-mode-flag>> not ] } 1&& [
+                    dup doctype>> limited-quirks-mode? [ t >>limited-quirks-mode? ] when
+                ] when
+            ] if
+            drop
+        ]
+        [ dup doctype>> tree-insert drop ]
+        [ f >>doctype drop ]
+    } cleave ;
+
+: reset-temporary-buffer ( document -- ) SBUF" " clone temporary-buffer<< ;
+: ch>new-temporary-buffer ( ch document -- ) [ 1sbuf ] dip temporary-buffer<< ;
+: string>new-temporary-buffer ( string document -- ) [ SBUF" " clone-like ] dip temporary-buffer<< ;
+: temporary-buffer-last ( document -- ch/f ) temporary-buffer>> ?last ;
+: push-temporary-buffer ( ch document -- ) temporary-buffer>> push ;
+: push-all-temporary-buffer ( string document -- ) temporary-buffer>> push-all ;
+
+: flush-temporary-buffer ( document -- )
+    "flush-temporary-buffer: " write
+    [ [ temporary-buffer>> ] keep [ emit-char ] curry each ]
+    [ SBUF" " clone >>temporary-buffer drop ] bi ;
+
+: emit-temporary-buffer-with ( string document -- )
+    [ temporary-buffer>> push-all ]
+    [ flush-temporary-buffer ] bi ;
+
+! check if matches open tag
+: appropriate-end-tag-token? ( document -- ? )
+    drop f ;
+
+: ascii-upper-alpha? ( ch -- ? ) [ CHAR: A CHAR: Z between? ] [ f ] if* ; inline
+: ascii-lower-alpha? ( ch -- ? ) [ CHAR: a CHAR: z between? ] [ f ] if* ; inline
+: ascii-upper-hex-digit? ( ch -- ? ) [ CHAR: A CHAR: F between? ] [ f ] if* ; inline
+: ascii-lower-hex-digit? ( ch -- ? ) [ CHAR: a CHAR: f between? ] [ f ] if* ; inline
+: ascii-hex-alpha? ( ch -- ? ) { [ ascii-upper-hex-digit? ] [ ascii-lower-hex-digit? ] } 1|| ; inline
+
+: ascii-digit? ( ch/f -- ? ) [ CHAR: 0 CHAR: 9 between? ] [ f ] if* ;
+: ascii-alpha? ( ch/f -- ? ) { [ ascii-lower-alpha? ] [ ascii-upper-alpha? ] } 1|| ;
+: ascii-alphanumeric? ( ch/f -- ? ) { [ ascii-alpha? ] [ ascii-digit? ] } 1|| ;
+: ascii-hex-digit? ( ch/f -- ? ) { [ ascii-digit? ] [ ascii-hex-alpha? ] } 1|| ;
+
+: (return-state) ( document n/f string ch/f -- document n'/f string )
+    reach [ f ] change-return-state drop check-return-state
+    execute( document n/f string ch/f -- document n'/f string ) ;
+
+: return-state ( document n/f string -- document n'/f string )
+    pick [ f ] change-return-state drop check-return-state
+    execute( document n/f string -- document n'/f string ) ;
+
+: (data-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: & = ] [ drop [ \ data-state >>return-state ] 2dip character-reference-state ] }
+        { [ dup CHAR: < = ] [ drop tag-open-state ] }
+        { [ dup CHAR: \0 = ] [ unexpected-null-character ] }
+        { [ dup f = ] [ drop pick emit-eof ] }
+        [ reach emit-char data-state ]
+    } cond ;
+
+: data-state ( document n/f string -- document n'/f string )
+    take-char (data-state) ;
+
+
+: (rcdata-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: & = ] [ drop [ \ rcdata-state >>return-state ] 2dip character-reference-state ] }
+        { [ dup CHAR: < = ] [ drop rcdata-less-than-sign-state ] }
+        { [ dup CHAR: \0 = ] [ unexpected-null-character ] }
+        { [ dup f = ] [ drop pick emit-eof ] }
+        [ reach emit-char rcdata-state ]
+    } cond ;
+
+: rcdata-state ( document n/f string -- document n'/f string )
+    take-char (rcdata-state) ;
+
+
+: (rawtext-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: < = ] [ drop rawtext-less-than-sign-state ] }
+        { [ dup CHAR: \0 = ] [ drop unexpected-null-character ] }
+        { [ dup f = ] [ drop pick emit-eof ] }
+        [ reach emit-char rawtext-state ]
+    } cond ;
+
+: rawtext-state ( document n/f string -- document n'/f string )
+    take-char (rawtext-state) ;
+
+
+: (script-data-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: < = ] [ drop script-data-less-than-sign-state ] }
+        { [ dup CHAR: \0 = ] [ drop unexpected-null-character ] }
+        { [ dup f = ] [ drop pick emit-eof ] }
+        [ reach emit-char script-data-state ]
+    } cond ;
+
+: script-data-state ( document n/f string -- document n'/f string )
+    take-char (script-data-state) ;
+
+
+: (plaintext-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: \0 = ] [ drop unexpected-null-character ] }
+        { [ dup f = ] [ drop pick emit-eof ] }
+        [ reach emit-char plaintext-state ]
+    } cond ;
+
+: plaintext-state ( document n/f string -- document n'/f string )
+    take-char (plaintext-state) ;
+
+
+: (tag-open-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup ascii-alpha? ] [ reach new-tag (tag-name-state) ] }
+        { [ dup CHAR: ! = ] [ drop markup-declaration-open-state ] }
+        { [ dup CHAR: / = ] [ drop end-tag-open-state ] }
+        { [ dup CHAR: ? = ] [ unexpected-question-mark-instead-of-tag-name ] }
+        { [ dup f = ] [ eof-before-tag-name ] }
+        [ invalid-first-character-of-tag-name ]
+    } cond ;
+
+: tag-open-state ( document n/f string -- document n'/f string )
+    take-char (tag-open-state) ;
+
+
+: (end-tag-open-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup ascii-alpha? ] [ reach new-end-tag (tag-name-state) ] }
+        { [ dup CHAR: > = ] [ missing-end-tag-name ] }
+        { [ dup f = ] [ eof-before-tag-name ] }
+        [ invalid-first-character-of-tag-name ]
+    } cond ;
+
+: end-tag-open-state ( document n/f string -- document n'/f string )
+    take-char (end-tag-open-state) ;
+
+
+: (tag-name-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup ascii-upper-alpha? ] [ 0x20 + reach push-tag-name tag-name-state ] }
+        { [ dup "\t\n\f\s" member? ] [ drop before-attribute-name-state ] }
+        { [ dup CHAR: / = ] [ drop self-closing-start-tag-state ] }
+        { [ dup CHAR: > = ] [ drop pick emit-tag data-state ] }
+        { [ dup CHAR: \0 = ] [ unexpected-null-character ] }
+        { [ dup f = ] [ eof-before-tag-name ] }
+        [ reach push-tag-name tag-name-state ]
+    } cond ;
+
+: tag-name-state ( document n/f string -- document n'/f string )
+    take-char (tag-name-state) ;
+
+
+: (rcdata-less-than-sign-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: / = ] [ drop pick reset-temporary-buffer rcdata-end-tag-open-state ] }
+        [ [ CHAR: < reach emit-char ] dip (rcdata-state) ]
+    } cond ;
+
+: rcdata-less-than-sign-state ( document n/f string -- document n'/f string )
+    take-char (rcdata-less-than-sign-state) ;
+
+
+: (rcdata-end-tag-open-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup ascii-alpha? ] [ reach new-end-tag (rcdata-end-tag-name-state) ] }
+        [ [ CHAR: < reach emit-char ] dip (rcdata-state) ]
+    } cond ;
+
+: rcdata-end-tag-open-state ( document n/f string -- document n'/f string )
+    take-char (rcdata-end-tag-open-state) ;
+
+
+: (rcdata-end-tag-name-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup "\t\n\f\s" member? ] [
+            drop pick appropriate-end-tag-token?
+            [ before-attribute-name-state ] [ "</" reach emit-temporary-buffer-with rcdata-state ] if
+        ] }
+        { [ dup CHAR: / = ] [
+            drop pick appropriate-end-tag-token?
+            [ self-closing-start-tag-state ] [ "</" reach emit-temporary-buffer-with rcdata-state ] if
+        ] }
+        { [ dup CHAR: > = ] [
+            drop pick appropriate-end-tag-token?
+            [ pick emit-end-tag data-state ] [ "</" reach emit-temporary-buffer-with rcdata-state ] if
+        ] }
+        { [ dup ascii-upper-alpha? ] [ [ 0x20 + reach push-tag-name ] [ reach push-temporary-buffer ] bi rcdata-end-tag-name-state ] }
+        { [ dup ascii-lower-alpha? ] [ [ reach push-tag-name ] [ reach push-temporary-buffer ] bi rcdata-end-tag-name-state ] }
+        [ [ "</" reach emit-temporary-buffer-with ] dip (rcdata-state) ]
+    } cond ;
+
+: rcdata-end-tag-name-state ( document n/f string -- document n'/f string )
+    take-char (rcdata-end-tag-name-state) ;
+
+
+: (rawtext-less-than-sign-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: / = ] [ drop pick reset-temporary-buffer rawtext-end-tag-open-state ] }
+        [ [ CHAR: < reach emit-char ] dip (rawtext-state) ]
+    } cond ;
+
+: rawtext-less-than-sign-state ( document n/f string -- document n'/f string )
+    take-char (rawtext-less-than-sign-state) ;
+
+
+: (rawtext-end-tag-open-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup ascii-alpha? ] [ reach new-end-tag (rawtext-end-tag-name-state) ] }
+        [ [ CHAR: < reach emit-char ] dip (rawtext-state) ]
+    } cond ;
+
+: rawtext-end-tag-open-state ( document n/f string -- document n'/f string )
+    take-char (rawtext-end-tag-open-state) ;
+
+
+: (rawtext-end-tag-name-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup "\t\n\f\s" member? ] [
+            drop pick appropriate-end-tag-token?
+            [ before-attribute-name-state ] [ "</" reach emit-temporary-buffer-with rawtext-state ] if
+        ] }
+        { [ dup CHAR: / = ] [
+            drop pick appropriate-end-tag-token?
+            [ self-closing-start-tag-state ] [ "</" reach emit-temporary-buffer-with rawtext-state ] if
+        ] }
+        { [ dup CHAR: > = ] [
+            drop pick appropriate-end-tag-token?
+            [ pick emit-end-tag data-state ] [ "</" reach emit-temporary-buffer-with rawtext-state ] if
+        ] }
+        { [ dup ascii-upper-alpha? ] [ [ 0x20 + reach push-tag-name ] [ reach push-temporary-buffer ] bi rawtext-end-tag-name-state ] }
+        { [ dup ascii-lower-alpha? ] [ [ reach push-tag-name ] [ reach push-temporary-buffer ] bi rawtext-end-tag-name-state ] }
+        [ [ "</" reach emit-temporary-buffer-with ] dip (rawtext-state) ]
+    } cond ;
+
+: rawtext-end-tag-name-state ( document n/f string -- document n'/f string )
+    take-char (rawtext-end-tag-name-state) ;
+
+
+: (script-data-less-than-sign-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: / = ] [ drop pick reset-temporary-buffer script-data-end-tag-open-state ] }
+        { [ dup CHAR: ! = ] [ drop "<!" reach emit-string script-data-escape-start-state ] }
+        [ [ CHAR: < reach emit-char ] dip (script-data-state) ]
+    } cond ;
+
+: script-data-less-than-sign-state ( document n/f string -- document n'/f string )
+    take-char (script-data-less-than-sign-state) ;
+
+
+: (script-data-end-tag-open-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup ascii-alpha? ] [ reach new-end-tag (script-data-end-tag-name-state) ] }
+        [ [ "</" reach emit-string ] dip (script-data-state) ]
+    } cond ;
+
+: script-data-end-tag-open-state ( document n/f string -- document n'/f string )
+    take-char (script-data-end-tag-open-state) ;
+
+
+: (script-data-end-tag-name-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup "\t\n\f\s" member? ] [
+            drop pick appropriate-end-tag-token?
+            [ before-attribute-name-state ] [ "</" reach emit-temporary-buffer-with script-data-state ] if
+        ] }
+        { [ dup CHAR: / = ] [
+            drop pick appropriate-end-tag-token?
+            [ self-closing-start-tag-state ] [ "</" reach emit-temporary-buffer-with script-data-state ] if
+        ] }
+        { [ dup CHAR: > = ] [
+            drop pick appropriate-end-tag-token?
+            [ pick emit-end-tag data-state ] [ "</" reach emit-temporary-buffer-with script-data-state ] if
+        ] }
+        { [ dup ascii-upper-alpha? ] [ [ 0x20 + reach push-tag-name ] [ reach push-temporary-buffer ] bi rawtext-end-tag-name-state ] }
+        { [ dup ascii-lower-alpha? ] [ [ reach push-tag-name ] [ reach push-temporary-buffer ] bi rawtext-end-tag-name-state ] }
+        [ [ "</" reach emit-temporary-buffer-with ] dip (script-data-state) ]
+    } cond ;
+
+: script-data-end-tag-name-state ( document n/f string -- document n'/f string )
+    take-char (script-data-end-tag-name-state) ;
+
+
+: (script-data-escape-start-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: - = ] [ drop script-data-escape-start-dash-state ] }
+        [ (script-data-state) ]
+    } cond ;
+
+: script-data-escape-start-state ( document n/f string -- document n'/f string )
+    take-char (script-data-escape-start-state) ;
+
+
+: (script-data-escape-start-dash-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: - = ] [ drop script-data-escaped-dash-dash-state ] }
+        [ (script-data-state) ]
+    } cond ;
+
+: script-data-escape-start-dash-state ( document n/f string -- document n'/f string )
+    take-char (script-data-escape-start-dash-state) ;
+
+
+: (script-data-escaped-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: - = ] [ drop script-data-escaped-dash-state ] }
+        { [ dup CHAR: < = ] [ drop script-data-escaped-less-than-sign-state ] }
+        { [ dup CHAR: \0 = ] [ unexpected-null-character CHAR: replacement-character unimplemented* ] }
+        { [ dup f = ] [ eof-in-script-html-comment-like-text ] }
+        [ reach emit-char script-data-escaped-state ]
+    } cond ;
+
+: script-data-escaped-state ( document n/f string -- document n'/f string )
+    take-char (script-data-escaped-state) ;
+
+
+: (script-data-escaped-dash-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: - = ] [ drop script-data-escaped-dash-dash-state ] }
+        { [ dup CHAR: < = ] [ drop script-data-escaped-less-than-sign-state ] }
+        { [ dup CHAR: \0 = ] [ unexpected-null-character script-data-escaped-state ] }
+        { [ dup f = ] [ eof-in-script-html-comment-like-text ] }
+        [ reach emit-char script-data-escaped-state ]
+    } cond ;
+
+: script-data-escaped-dash-state ( document n/f string -- document n'/f string )
+    take-char (script-data-escaped-dash-state) ;
+
+
+: (script-data-escaped-dash-dash-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: - = ] [ reach emit-char script-data-escaped-dash-dash-state ] }
+        { [ dup CHAR: < = ] [ drop script-data-escaped-less-than-sign-state ] }
+        { [ dup CHAR: > = ] [ reach emit-char script-data-state ] }
+        { [ dup CHAR: \0 = ] [ unexpected-null-character script-data-escaped-state ] }
+        { [ dup f = ] [ eof-in-script-html-comment-like-text ] }
+        [ reach emit-char script-data-escaped-state ]
+    } cond ;
+
+: script-data-escaped-dash-dash-state ( document n/f string -- document n'/f string )
+    take-char (script-data-escaped-dash-dash-state) ;
+
+
+: (script-data-escaped-less-than-sign-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: / = ] [ drop pick reset-temporary-buffer script-data-escaped-end-tag-open-state ] }
+        { [ dup ascii-alpha? ] [ [ pick reset-temporary-buffer CHAR: < reach emit-char ] dip (script-data-double-escape-start-state) ] }
+        [ [ CHAR: < reach emit-char ] dip (script-data-escaped-state) ]
+    } cond ;
+
+: script-data-escaped-less-than-sign-state ( document n/f string -- document n'/f string )
+    take-char (script-data-escaped-less-than-sign-state) ;
+
+
+: (script-data-escaped-end-tag-open-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup ascii-alpha? ] [ [ pick new-end-tag ] dip (script-data-escaped-end-tag-name-state) ] }
+        [ [ "</" reach emit-string ] dip (script-data-escaped-state) ]
+    } cond ;
+
+: script-data-escaped-end-tag-open-state ( document n/f string -- document n'/f string )
+    take-char (script-data-escaped-end-tag-open-state) ;
+
+
+: (script-data-escaped-end-tag-name-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup "\t\n\f\s" member? ] [
+            drop pick appropriate-end-tag-token?
+            [ before-attribute-name-state ] [ "</" reach emit-temporary-buffer-with script-data-escaped-state ] if
+        ] }
+        { [ dup CHAR: / = ] [
+            drop pick appropriate-end-tag-token?
+            [ self-closing-start-tag-state ] [ "</" reach emit-temporary-buffer-with script-data-escaped-state ] if
+        ] }
+        { [ dup CHAR: > = ] [
+            drop pick appropriate-end-tag-token?
+            [ pick emit-end-tag data-state ] [ "</" reach emit-temporary-buffer-with script-data-escaped-state ] if
+        ] }
+        { [ dup ascii-upper-alpha? ] [ [ 0x20 + reach push-tag-name ] [ reach push-temporary-buffer ] bi script-data-escaped-end-tag-name-state ] }
+        { [ dup ascii-lower-alpha? ] [ [ reach push-tag-name ] [ reach push-temporary-buffer ] bi script-data-escaped-end-tag-name-state ] }
+        [ [ "</" reach emit-temporary-buffer-with ] dip (script-data-escaped-state) ]
+    } cond ;
+
+: script-data-escaped-end-tag-name-state ( document n/f string -- document n'/f string )
+    take-char (script-data-escaped-end-tag-name-state) ;
+
+
+: (script-data-double-escape-start-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup "\t\n\f\s/>" member? ] [
+            reach emit-char
+            pick temporary-buffer>> "script" sequence=
+            [ script-data-double-escaped-state ] [ script-data-escaped-state ] if
+        ] }
+        { [ dup ascii-upper-alpha? ] [ [ 0x20 + reach push-tag-name ] [ reach push-temporary-buffer ] bi script-data-double-escape-start-state ] }
+        { [ dup ascii-lower-alpha? ] [ [ reach push-tag-name ] [ reach push-temporary-buffer ] bi script-data-double-escape-start-state ] } ! todo
+        [ (script-data-escaped-state) ]
+    } cond ;
+
+: script-data-double-escape-start-state ( document n/f string -- document n'/f string )
+    take-char (script-data-double-escape-start-state) ;
+
+
+: (script-data-double-escaped-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: - = ] [ reach emit-char script-data-double-escaped-dash-state ] }
+        { [ dup CHAR: < = ] [ reach emit-char script-data-double-escaped-less-than-sign-state ] }
+        { [ dup CHAR: \0 = ] [
+            unexpected-null-character
+            CHAR: replacement-character reach emit-char
+            script-data-double-escaped-state
+        ] }
+        { [ dup f = ] [ eof-in-script-html-comment-like-text ] }
+        [ reach emit-char script-data-double-escaped-state ]
+    } cond ;
+
+: script-data-double-escaped-state ( document n/f string -- document n'/f string )
+    take-char (script-data-double-escaped-state) ;
+
+
+: (script-data-double-escaped-dash-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: - = ] [ reach emit-char script-data-double-escaped-dash-dash-state ] }
+        { [ dup CHAR: < = ] [ reach emit-char script-data-double-escaped-less-than-sign-state ] }
+        { [ dup CHAR: \0 = ] [
+            unexpected-null-character
+            CHAR: replacement-character reach emit-char
+            script-data-double-escaped-state
+        ] }
+        { [ dup f = ] [ eof-in-script-html-comment-like-text ] }
+        [ reach emit-char script-data-double-escaped-state ]
+    } cond ;
+
+: script-data-double-escaped-dash-state ( document n/f string -- document n'/f string )
+    take-char (script-data-double-escaped-dash-state) ;
+
+
+: (script-data-double-escaped-dash-dash-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: - = ] [ reach emit-char script-data-double-escaped-dash-dash-state ] }
+        { [ dup CHAR: < = ] [ reach emit-char script-data-double-escaped-less-than-sign-state ] }
+        { [ dup CHAR: > = ] [ reach emit-char script-data-state ] }
+        { [ dup CHAR: \0 = ] [
+            unexpected-null-character
+            CHAR: replacement-character reach emit-char
+            script-data-double-escaped-state
+        ] }
+        { [ dup f = ] [ eof-in-script-html-comment-like-text ] }
+        [ reach emit-char script-data-escaped-state ]
+    } cond ;
+
+: script-data-double-escaped-dash-dash-state ( document n/f string -- document n'/f string )
+    take-char (script-data-double-escaped-dash-dash-state) ;
+
+
+: (script-data-double-escaped-less-than-sign-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: / = ] [ reach emit-char pick reset-temporary-buffer script-data-double-escape-end-state ] }
+        [ (script-data-double-escaped-state) ]
+    } cond ;
+
+: script-data-double-escaped-less-than-sign-state ( document n/f string -- document n'/f string )
+    take-char (script-data-double-escaped-less-than-sign-state) ;
+
+
+: (script-data-double-escape-end-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup "\t\n\f\s/>" member? ] [
+            reach emit-char
+            pick temporary-buffer>> "script" sequence=
+            [ script-data-escaped-state ] [ script-data-double-escaped-state ] if
+        ] }
+        { [ dup ascii-upper-alpha? ] [ [ 0x20 + reach push-tag-name ] [ reach push-temporary-buffer ] bi script-data-double-escape-end-state ] }
+        { [ dup ascii-lower-alpha? ] [ [ reach push-tag-name ] [ reach push-temporary-buffer ] bi script-data-double-escape-end-state ] } ! todo
+        [ (script-data-double-escaped-state) ]
+    } cond ;
+
+: script-data-double-escape-end-state ( document n/f string -- document n'/f string )
+    take-char (script-data-double-escape-end-state) ;
+
+
+: (before-attribute-name-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup "\t\n\f\s" member? ] [ drop before-attribute-name-state ] }
+        { [ dup "/>" member? ] [ (after-attribute-name-state) ] }
+        { [ dup f = ] [ (after-attribute-name-state) ] }
+        { [ dup CHAR: = = ] [ unexpected-equals-sign-before-attribute-name ] }
+        [ reach push-attribute (attribute-name-state) ]
+    } cond ;
+
+: before-attribute-name-state ( document n/f string -- document n'/f string )
+    take-char (before-attribute-name-state) ;
+
+
+: (attribute-name-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup "\t\n\f\s/>" member? ] [ (after-attribute-name-state) ] }
+        { [ dup f = ] [ (after-attribute-name-state) ] }
+        { [ dup CHAR: = = ] [ drop before-attribute-value-state ] }
+        { [ dup ascii-upper-alpha? ] [
+            0x20 + reach push-attribute-name
+            attribute-name-state
+        ] }
+        { [ dup CHAR: \0 = ] [ unexpected-null-character ] }
+        { [ dup "\"'<" member? ] [
+            unexpected-character-in-attribute-name
+            reach push-attribute-name attribute-name-state
+        ] }
+        [ reach push-attribute-name attribute-name-state ]
+    } cond ;
+
+: attribute-name-state ( document n/f string -- document n'/f string )
+    take-char (attribute-name-state) ;
+
+
+: (after-attribute-name-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup "\t\n\f\s" member? ] [ drop after-attribute-name-state ] }
+        { [ dup CHAR: / = ] [ drop self-closing-start-tag-state ] }
+        { [ dup CHAR: = = ] [ drop before-attribute-value-state ] }
+        { [ dup CHAR: > = ] [ drop pick emit-tag data-state ] }
+        { [ dup f = ] [ eof-in-tag ] }
+        [ [ pick push-attribute ] dip (attribute-name-state) ]
+    } cond ;
+
+: after-attribute-name-state ( document n/f string -- document n'/f string )
+    take-char (after-attribute-name-state) ;
+
+
+: (before-attribute-value-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup "\t\n\f\s" member? ] [ drop before-attribute-name-state ] }
+        { [ dup CHAR: " = ] [ drop attribute-value-double-quoted-state ] }
+        { [ dup CHAR: ' = ] [ drop attribute-value-single-quoted-state ] }
+        { [ dup CHAR: > = ] [ drop missing-attribute-value ] }
+        [ (attribute-value-unquoted-state) ]
+    } cond ;
+
+: before-attribute-value-state ( document n/f string -- document n'/f string )
+    take-char (before-attribute-value-state) ;
+
+
+: (attribute-value-double-quoted-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: " = ] [ drop after-attribute-value-quoted-state ] }
+        { [ dup CHAR: & = ] [
+            drop
+            [ \ attribute-value-double-quoted-state >>return-state ] 2dip character-reference-state
+        ] }
+        { [ dup CHAR: \0 = ] [ unexpected-null-character ] }
+        { [ dup f = ] [ eof-in-tag ] }
+        [ reach push-attribute-value attribute-value-double-quoted-state ]
+    } cond ;
+
+: attribute-value-double-quoted-state ( document n/f string -- document n'/f string )
+    take-char (attribute-value-double-quoted-state) ;
+
+
+: (attribute-value-single-quoted-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: ' = ] [ drop after-attribute-value-quoted-state ] }
+        { [ dup CHAR: & = ] [
+            drop [ \ attribute-value-single-quoted-state >>return-state ] 2dip
+            character-reference-state
+        ] }
+        { [ dup CHAR: \0 = ] [
+            drop unexpected-null-character
+            CHAR: replacement-character reach push-attribute-value
+        ] }
+        { [ dup f = ] [ eof-in-tag ] }
+        [ reach push-attribute-value attribute-value-single-quoted-state ]
+    } cond ;
+
+: attribute-value-single-quoted-state ( document n/f string -- document n'/f string )
+    take-char (attribute-value-single-quoted-state) ;
+
+
+: (attribute-value-unquoted-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup "\t\n\f\s" member? ] [ drop before-attribute-name-state ] }
+        { [ dup CHAR: & = ] [
+            drop
+            [ \ attribute-value-unquoted-state >>return-state ] 2dip character-reference-state
+        ] }
+        { [ dup CHAR: > = ] [ drop pick emit-tag data-state ] }
+        { [ dup CHAR: \0 = ] [ drop unexpected-null-character CHAR: replacement-character reach push-attribute-value ] }
+        { [ dup "\"'<=`" member? ] [
+            unexpected-character-in-unquoted-attribute-value
+            reach push-attribute-value
+            attribute-value-unquoted-state
+        ] }
+        { [ dup f = ] [ eof-in-tag ] }
+        [ reach push-attribute-value attribute-value-unquoted-state ]
+    } cond ;
+
+: attribute-value-unquoted-state ( document n/f string -- document n'/f string )
+    take-char (attribute-value-unquoted-state) ;
+
+
+: (after-attribute-value-quoted-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup "\t\n\f\s" member? ] [ drop before-attribute-name-state ] }
+        { [ dup CHAR: / = ] [ drop self-closing-start-tag-state ] }
+        { [ dup CHAR: > = ] [ drop pick emit-tag data-state ] }
+        { [ dup f = ] [ eof-in-tag ] }
+        [ missing-whitespace-between-attributes (before-attribute-name-state) ]
+    } cond ;
+
+: after-attribute-value-quoted-state ( document n/f string -- document n'/f string )
+    take-char (after-attribute-value-quoted-state) ;
+
+
+: (self-closing-start-tag-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: > = ] [ drop pick [ set-self-closing ] [ emit-tag ] bi data-state ] }
+        { [ dup f = ] [ eof-in-tag ] }
+        [ unexpected-solidus-in-tag ]
+    } cond ;
+
+: self-closing-start-tag-state ( document n/f string -- document n'/f string )
+    take-char (self-closing-start-tag-state) ;
+
+
+: (bogus-comment-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: > = ] [ drop pick emit-comment-token data-state ] }
+        { [ dup f = ] [ drop pick [ emit-comment-token ] [ emit-eof ] bi ] }
+        { [ dup CHAR: \0 = ] [ drop unexpected-null-character CHAR: replacement-character reach push-comment-token ] }
+        [ reach push-comment-token bogus-comment-state ]
+    } cond ;
+
+: bogus-comment-state ( document n/f string -- document n'/f string )
+    take-char (bogus-comment-state) ;
+
+
+: markup-declaration-open-state ( document n/f string -- document n'/f string )
+    {
+        { [ "--" take-from? ] [ comment-start-state ] }
+        { [ "DOCTYPE" take-from-insensitive? ] [ doctype-state ] }
+        { [ "[CDATA[" take-from-insensitive? ] [ unimplemented* ] }
+        [
+            incorrectly-opened-comment ! bogus-comment-state
+        ]
+    } cond ;
+
+: (comment-start-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: - = ] [ drop comment-start-dash-state ] }
+        { [ dup CHAR: > = ] [ drop abrupt-closing-of-empty-comment pick emit-comment-token data-state ] }
+        [ (comment-state) ]
+    } cond ;
+
+: comment-start-state ( document n/f string -- document n'/f string )
+    take-char (comment-start-state) ;
+
+
+: (comment-start-dash-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: - = ] [ drop comment-end-state ] }
+        { [ dup CHAR: > = ] [ drop abrupt-closing-of-empty-comment ] }
+        { [ dup f = ] [ eof-in-comment ] }
+        [ [ CHAR: - reach push-comment-token ] dip (comment-state) ]
+    } cond ;
+
+: comment-start-dash-state ( document n/f string -- document n'/f string )
+    take-char (comment-start-dash-state) ;
+
+
+: (comment-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: < = ] [ reach push-comment-token comment-less-than-sign-state ] }
+        { [ dup CHAR: - = ] [ drop comment-end-dash-state ] }
+        { [ dup CHAR: \0 = ] [ drop unexpected-null-character ] }
+        { [ dup f = ] [ eof-in-comment ] }
+        [ reach push-comment-token comment-state ]
+    } cond ;
+
+: comment-state ( document n/f string -- document n'/f string )
+    take-char (comment-state) ;
+
+
+: (comment-less-than-sign-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: ! = ] [ reach push-comment-token comment-less-than-sign-bang-state ] }
+        { [ dup CHAR: < = ] [ reach push-comment-token comment-less-than-sign-state ] }
+        [ (comment-state) ]
+    } cond ;
+
+: comment-less-than-sign-state ( document n/f string -- document n'/f string )
+    take-char (comment-less-than-sign-state) ;
+
+
+: (comment-less-than-sign-bang-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: - = ] [ reach push-comment-token comment-less-than-sign-bang-dash-state ] }
+        [ (comment-state) ]
+    } cond ;
+
+: comment-less-than-sign-bang-state ( document n/f string -- document n'/f string )
+    take-char (comment-less-than-sign-bang-state) ;
+
+
+: (comment-less-than-sign-bang-dash-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: - = ] [ drop comment-less-than-sign-bang-dash-dash-state ] }
+        [ (comment-end-dash-state) ]
+    } cond ;
+
+: comment-less-than-sign-bang-dash-state ( document n/f string -- document n'/f string )
+    take-char (comment-less-than-sign-bang-dash-state) ;
+
+
+: (comment-less-than-sign-bang-dash-dash-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: > = ] [ (comment-end-state) ] }
+        { [ dup f = ] [ (comment-end-state) ] }
+        [ nested-comment (comment-end-state) ]
+    } cond ;
+
+: comment-less-than-sign-bang-dash-dash-state ( document n/f string -- document n'/f string )
+    take-char (comment-less-than-sign-bang-dash-dash-state) ;
+
+
+: (comment-end-dash-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: - = ] [ drop comment-end-state ] }
+        { [ dup f = ] [ eof-in-comment ] }
+        [ [ CHAR: - reach push-comment-token ] dip (comment-state) ]
+    } cond ;
+
+: comment-end-dash-state ( document n/f string -- document n'/f string )
+    take-char (comment-end-dash-state) ;
+
+
+: (comment-end-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: > = ] [ drop pick emit-comment-token data-state ] }
+        { [ dup CHAR: ! = ] [ drop comment-end-bang-state ] }
+        { [ dup CHAR: - = ] [ reach push-comment-token comment-end-state ] }
+        { [ dup f = ] [ drop eof-in-comment pick [ emit-comment-token ] [ emit-eof ] bi ] }
+        [ [ "--" reach push-all-comment-token ] dip (comment-state) ]
+    } cond ;
+
+: comment-end-state ( document n/f string -- document n'/f string )
+    take-char (comment-end-state) ;
+
+
+: (comment-end-bang-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: - = ] [ drop comment-end-dash-state ] }
+        { [ dup CHAR: > = ] [ drop incorrectly-closed-comment pick emit-comment-token data-state ] }
+        { [ dup f = ] [ eof-in-comment ] }
+        [ [ "--!" reach push-all-comment-token ] dip (comment-state) ]
+    } cond ;
+
+: comment-end-bang-state ( document n/f string -- document n'/f string )
+    take-char (comment-end-bang-state) ;
+
+
+: (doctype-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup "\t\n\f\s" member? ] [ drop before-doctype-name-state ] }
+        { [ dup CHAR: > = ] [ (before-doctype-name-state) ] }
+        { [ dup f = ] [ drop eof-in-doctype pick [ new-doctype-with-quirks ] [ emit-doctype ] [ emit-eof ] tri ] }
+        [ missing-whitespace-before-doctype-name ]
+    } cond ;
+
+: doctype-state ( document n/f string -- document n'/f string )
+    take-char (doctype-state) ;
+
+
+: (before-doctype-name-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup "\t\n\f\s" member? ] [ drop before-doctype-name-state ] }
+        { [ dup ascii-upper-alpha? ] [ 0x20 + reach new-doctype-from-ch doctype-name-state ] }
+        { [ dup CHAR: \0 = ] [
+            drop
+            unexpected-null-character
+            CHAR: replacement-character reach new-doctype-from-ch
+            doctype-name-state
+        ] }
+        { [ dup CHAR: > = ] [
+            drop missing-doctype-name
+            pick [ new-doctype-with-quirks ] [ emit-doctype ] bi
+        ] }
+        { [ dup f = ] [
+            drop eof-in-doctype
+            pick [ new-doctype-with-quirks ] [ emit-doctype ] [ emit-eof ] tri
+        ] }
+        [ reach new-doctype-from-ch doctype-name-state ]
+    } cond ;
+
+: before-doctype-name-state ( document n/f string -- document n'/f string )
+    take-char (before-doctype-name-state) ;
+
+
+: (doctype-name-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup "\t\n\f\s" member? ] [ drop after-doctype-name-state ] }
+        { [ dup CHAR: > = ] [ drop pick emit-doctype data-state ] }
+        { [ dup ascii-upper-alpha? ] [ 0x20 + reach push-doctype-name doctype-name-state ] }
+        { [ dup CHAR: \0 = ] [
+            drop unexpected-null-character
+            CHAR: replacement-character pick push-doctype-name
+            doctype-name-state
+        ] }
+        { [ dup f = ] [ drop eof-in-doctype pick [ emit-doctype ] [ emit-eof ] bi ] } ! force-quirks on for doctype
+        [ reach push-doctype-name doctype-name-state ]
+    } cond ;
+
+: doctype-name-state ( document n/f string -- document n'/f string )
+    take-char (doctype-name-state) ;
+
+
+: (after-doctype-name-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup "\t\n\f\s" member? ] [ drop after-doctype-name-state ] }
+        { [ dup CHAR: > = ] [ drop pick emit-doctype data-state ] }
+        { [ dup f = ] [ eof-in-doctype ] }
+        { [ [ "PUBLIC" take-from-insensitive? ] dip swap ] [ drop after-doctype-public-keyword-state ] }
+        { [ [ "SYSTEM" take-from-insensitive? ] dip swap ] [ drop after-doctype-system-keyword-state ] }
+        [ invalid-character-sequence-after-doctype-name ]
+    } cond ;
+
+: after-doctype-name-state ( document n/f string -- document n'/f string )
+    take-char (after-doctype-name-state) ;
+
+
+: (after-doctype-public-keyword-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup "\t\n\f\s" member? ] [ drop before-doctype-public-identifier-state ] }
+        { [ dup CHAR: " = ] [ missing-whitespace-after-doctype-public-keyword ] }
+        { [ dup CHAR: ' = ] [ missing-whitespace-after-doctype-public-keyword ] }
+        { [ dup CHAR: > = ] [ drop missing-doctype-public-identifier force-quirks data-state ] }
+        { [ dup f = ] [ drop eof-in-doctype pick [ emit-doctype ] [ emit-eof ] bi ] }
+        [
+            missing-quote-before-doctype-public-identifier
+            [ reach force-quirks ] dip
+            (bogus-doctype-state)
+        ]
+    } cond ;
+
+: after-doctype-public-keyword-state ( document n/f string -- document n'/f string )
+    take-char (after-doctype-public-keyword-state) ;
+
+
+: (before-doctype-public-identifier-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup "\t\n\f\s" member? ] [ drop before-doctype-public-identifier-state ] }
+        { [ dup CHAR: " = ] [
+            drop pick initialize-doctype-public-identifier
+            doctype-public-identifier-double-quoted-state
+        ] }
+        { [ dup CHAR: ' = ] [
+            drop pick initialize-doctype-public-identifier
+            doctype-public-identifier-single-quoted-state
+        ] }
+        { [ dup CHAR: > = ] [
+            drop missing-doctype-public-identifier
+            pick [ force-quirks ] [ emit-doctype ] bi
+            data-state
+        ] }
+        { [ dup f = ] [ drop eof-in-doctype pick [ emit-doctype ] [ emit-eof ] bi ] }
+        [
+            missing-quote-before-doctype-public-identifier
+            [ reach force-quirks ] dip
+            (bogus-doctype-state)
+        ]
+    } cond ;
+
+: before-doctype-public-identifier-state ( document n/f string -- document n'/f string )
+    take-char (before-doctype-public-identifier-state) ;
+
+
+: (doctype-public-identifier-double-quoted-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: " = ] [ drop after-doctype-public-identifier-state ] }
+        { [ dup CHAR: \0 = ] [
+            drop
+            unexpected-null-character
+            CHAR: replacement-character pick push-doctype-public-identifier
+            doctype-public-identifier-double-quoted-state
+        ] }
+        { [ dup CHAR: > = ] [
+            drop abrupt-doctype-public-identifier
+            pick [ force-quirks ] [ emit-doctype ] bi
+            data-state
+        ] }
+        { [ dup f = ] [
+            drop eof-in-doctype
+            pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri
+        ] }
+        [ reach push-doctype-public-identifier doctype-public-identifier-double-quoted-state ]
+    } cond ;
+
+: doctype-public-identifier-double-quoted-state ( document n/f string -- document n'/f string )
+    take-char (doctype-public-identifier-double-quoted-state) ;
+
+
+: (doctype-public-identifier-single-quoted-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: ' = ] [ drop after-doctype-public-identifier-state ] }
+        { [ dup CHAR: \0 = ] [
+            drop
+            unexpected-null-character
+            CHAR: replacement-character pick push-doctype-public-identifier
+            doctype-public-identifier-double-quoted-state
+        ] }
+        { [ dup CHAR: > = ] [
+            drop abrupt-doctype-public-identifier
+            pick [ force-quirks ] [ emit-doctype ] bi
+            data-state
+        ] }
+        { [ dup f = ] [
+            drop eof-in-doctype
+            pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri
+        ] }
+        [ reach push-doctype-public-identifier doctype-public-identifier-single-quoted-state ]
+    } cond ;
+
+: doctype-public-identifier-single-quoted-state ( document n/f string -- document n'/f string )
+    take-char (doctype-public-identifier-single-quoted-state) ;
+
+
+: (after-doctype-public-identifier-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup "\t\n\f\s" member? ] [ drop between-doctype-public-and-system-identifiers-state ] }
+        { [ dup CHAR: > = ] [
+            drop pick emit-doctype
+            data-state
+        ] }
+        { [ dup CHAR: " = ] [
+            drop missing-whitespace-between-doctype-public-and-system-identifiers
+            pick initialize-doctype-system-identifier
+            doctype-system-identifier-double-quoted-state
+        ] }
+        { [ dup CHAR: ' = ] [
+            drop missing-whitespace-between-doctype-public-and-system-identifiers
+            pick initialize-doctype-system-identifier
+            doctype-system-identifier-single-quoted-state
+        ] }
+        { [ dup f = ] [ drop eof-in-doctype pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri ] }
+        [
+            missing-quote-before-doctype-system-identifier
+            [ reach force-quirks ] dip
+            (bogus-doctype-state)
+        ]
+    } cond ;
+
+: after-doctype-public-identifier-state ( document n/f string -- document n'/f string )
+    take-char (after-doctype-public-identifier-state) ;
+
+
+: (between-doctype-public-and-system-identifiers-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup "\t\n\f\s" member? ] [ drop between-doctype-public-and-system-identifiers-state ] }
+        { [ dup CHAR: > = ] [
+            drop pick emit-doctype
+            data-state
+        ] }
+        { [ dup CHAR: " = ] [
+            drop pick initialize-doctype-system-identifier
+            doctype-system-identifier-double-quoted-state
+        ] }
+        { [ dup CHAR: ' = ] [
+            drop pick initialize-doctype-system-identifier
+            doctype-system-identifier-single-quoted-state
+        ] }
+        { [ dup f = ] [ drop eof-in-doctype pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri ] }
+        [
+            missing-quote-before-doctype-system-identifier
+            [ reach force-quirks ] dip
+            (bogus-doctype-state)
+        ]
+    } cond ;
+
+: between-doctype-public-and-system-identifiers-state ( document n/f string -- document n'/f string )
+    take-char (between-doctype-public-and-system-identifiers-state) ;
+
+
+: (after-doctype-system-keyword-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup "\t\n\f\s" member? ] [ drop between-doctype-public-and-system-identifiers-state ] }
+        { [ dup CHAR: " = ] [
+            drop missing-whitespace-after-doctype-system-keyword
+            pick initialize-doctype-system-identifier
+            doctype-system-identifier-double-quoted-state
+        ] }
+        { [ dup CHAR: ' = ] [
+            drop missing-whitespace-after-doctype-system-keyword
+            pick initialize-doctype-system-identifier
+            doctype-system-identifier-single-quoted-state
+        ] }
+        { [ dup CHAR: > = ] [
+            drop missing-doctype-system-identifier
+            pick [ force-quirks ] [ emit-doctype ] bi
+            data-state
+        ] }
+        { [ dup f = ] [ drop eof-in-doctype pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri ] }
+        [
+            missing-quote-before-doctype-system-identifier
+            [ reach force-quirks ] dip
+            (bogus-doctype-state)
+        ]
+    } cond ;
+
+: after-doctype-system-keyword-state ( document n/f string -- document n'/f string )
+    take-char (after-doctype-system-keyword-state) ;
+
+
+: (before-doctype-system-identifier-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup "\t\n\f\s" member? ] [ drop before-doctype-system-identifier-state ] }
+        { [ dup CHAR: " = ] [
+            drop pick initialize-doctype-system-identifier
+            doctype-system-identifier-double-quoted-state
+        ] }
+        { [ dup CHAR: ' = ] [
+            drop pick initialize-doctype-system-identifier
+            doctype-system-identifier-single-quoted-state
+        ] }
+        { [ dup CHAR: > = ] [
+            drop missing-doctype-system-identifier
+            pick [ force-quirks ] [ emit-doctype ] bi
+            data-state
+        ] }
+        { [ dup f = ] [ drop eof-in-doctype pick [ emit-doctype ] [ emit-eof ] bi ] }
+        [
+            missing-quote-before-doctype-system-identifier
+            [ reach force-quirks ] dip
+            (bogus-doctype-state)
+        ]
+    } cond ;
+
+: before-doctype-system-identifier-state ( document n/f string -- document n'/f string )
+    take-char (before-doctype-system-identifier-state) ;
+
+
+: (doctype-system-identifier-double-quoted-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: " = ] [ drop after-doctype-system-identifier-state ] }
+        { [ dup CHAR: \0 = ] [
+            drop
+            unexpected-null-character
+            CHAR: replacement-character pick push-doctype-system-identifier
+            doctype-system-identifier-double-quoted-state
+        ] }
+        { [ dup CHAR: > = ] [
+            drop abrupt-doctype-system-identifier
+            pick [ force-quirks ] [ emit-doctype ] bi
+            data-state
+        ] }
+        { [ dup f = ] [
+            drop eof-in-doctype
+            pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri
+        ] }
+        [ reach push-doctype-system-identifier doctype-system-identifier-double-quoted-state ]
+    } cond ;
+
+: doctype-system-identifier-double-quoted-state ( document n/f string -- document n'/f string )
+    take-char (doctype-system-identifier-double-quoted-state) ;
+
+
+: (doctype-system-identifier-single-quoted-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: ' = ] [ drop after-doctype-system-identifier-state ] }
+        { [ dup CHAR: \0 = ] [
+            drop
+            unexpected-null-character
+            CHAR: replacement-character pick push-doctype-system-identifier
+            doctype-system-identifier-double-quoted-state
+        ] }
+        { [ dup CHAR: > = ] [
+            drop abrupt-doctype-system-identifier
+            pick [ force-quirks ] [ emit-doctype ] bi
+            data-state
+        ] }
+        { [ dup f = ] [
+            drop eof-in-doctype
+            pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri
+        ] }
+        [ reach push-doctype-system-identifier doctype-system-identifier-single-quoted-state ]
+    } cond ;
+
+: doctype-system-identifier-single-quoted-state ( document n/f string -- document n'/f string )
+    take-char (doctype-system-identifier-single-quoted-state) ;
+
+
+: (after-doctype-system-identifier-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup "\t\n\f\s" member? ] [ drop after-doctype-system-identifier-state ] }
+        { [ dup CHAR: > = ] [
+            drop pick emit-doctype
+            data-state
+        ] }
+        { [ dup f = ] [ drop eof-in-doctype pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri ] }
+        [
+            unexpected-character-after-doctype-system-identifier
+            [ reach force-quirks ] dip
+            (bogus-doctype-state)
+        ]
+    } cond ;
+
+: after-doctype-system-identifier-state ( document n/f string -- document n'/f string )
+    take-char (after-doctype-system-identifier-state) ;
+
+
+: (bogus-doctype-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: > = ] [ drop pick emit-doctype data-state ] }
+        { [ dup CHAR: \0 = ] [ drop unexpected-null-character bogus-doctype-state ] }
+        { [ dup f = ] [ drop eof-in-doctype pick emit-eof ] }
+        [ drop bogus-doctype-state ]
+    } cond ;
+
+: bogus-doctype-state ( document n/f string -- document n'/f string )
+    take-char (bogus-doctype-state) ;
+
+
+: (cdata-section-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: ] = ] [ drop cdata-section-bracket-state ] }
+        { [ dup f = ] [ drop eof-in-cdata pick emit-eof ] }
+        [ reach emit-char cdata-section-state ]
+    } cond ;
+
+: cdata-section-state ( document n/f string -- document n'/f string )
+    take-char (cdata-section-state) ;
+
+
+: (cdata-section-bracket-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: ] = ] [ drop cdata-section-end-state ] }
+        [ [ CHAR: ] reach emit-char ] dip (cdata-section-state) ]
+    } cond ;
+
+: cdata-section-bracket-state ( document n/f string -- document n'/f string )
+    take-char (cdata-section-bracket-state) ;
+
+
+: (cdata-section-end-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup CHAR: ] = ] [ reach emit-char cdata-section-end-state ] }
+        { [ dup CHAR: > = ] [ drop data-state ] }
+        [ [ "]]" reach emit-string ] dip (cdata-section-state) ]
+    } cond ;
+
+: cdata-section-end-state ( document n/f string -- document n'/f string )
+    take-char (cdata-section-end-state) ;
+
+
+: (character-reference-state) ( document n/f string ch/f -- document n'/f string )
+    [ CHAR: & reach ch>new-temporary-buffer ] dip
+    {
+        { [ dup ascii-alphanumeric? ] [ (named-character-reference-state) ] }
+        { [ dup CHAR: # = ] [ reach push-temporary-buffer numeric-character-reference-state ] }
+        [ reach flush-temporary-buffer (return-state) ]
+    } cond ;
+
+: character-reference-state ( document n/f string -- document n'/f string )
+    take-char (character-reference-state) ;
+
+
+: (named-character-reference-state) ( document n/f string ch/f -- document n'/f string )
+    reach push-temporary-buffer
+    pick named-character-match?
+    [
+        drop ! exact match, drop prefix match
+        ! XXX: check me
+        {
+            [ pick temporary-buffer-attribute? ]
+            [ pick temporary-buffer>> ?last CHAR: ; = not ]
+            [ 3dup peek-from { [ CHAR: = = ] [ ascii-alphanumeric? ] } 1|| ]
+        } 0&& [
+            unimplemented*
+            flush-temporary-buffer
+            return-state
+        ] [
+            pick [ take-named-character ] [ flush-temporary-buffer ] bi return-state
+        ] if
+    ] [
+        ! prefix match?
+        [ named-character-reference-state ]
+        [ pick flush-temporary-buffer ambiguous-ampersand-state ] if
+    ] if ;
+
+: named-character-reference-state ( document n/f string -- document n'/f string )
+    take-char (named-character-reference-state) ;
+
+
+: (ambiguous-ampersand-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup ascii-alphanumeric? ] [
+            unimplemented*
+        ] }
+        { [ dup CHAR: ; = ] [ unknown-named-character-reference (return-state) ] }
+        [ (return-state) ]
+    } cond ;
+
+: ambiguous-ampersand-state ( document n/f string -- document n'/f string )
+    take-char (ambiguous-ampersand-state) ;
+
+
+: (numeric-character-reference-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup "xX" member? ] [ reach push-temporary-buffer hexadecimal-character-reference-start-state ] }
+        [ (decimal-character-reference-start-state) ]
+    } cond ;
+
+: numeric-character-reference-state ( document n/f string -- document n'/f string )
+    take-char (numeric-character-reference-state) ;
+
+
+: (hexadecimal-character-reference-start-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup ascii-hex-digit? ] [ (hexadecimal-character-reference-state) ] }
+        [ absence-of-digits-in-numeric-character-reference reach flush-temporary-buffer (return-state) ]
+    } cond ;
+
+: hexadecimal-character-reference-start-state ( document n/f string -- document n'/f string )
+    take-char (hexadecimal-character-reference-start-state) ;
+
+
+: (decimal-character-reference-start-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup ascii-digit? ] [ (decimal-character-reference-state) ] }
+        [ absence-of-digits-in-numeric-character-reference reach flush-temporary-buffer (return-state) ]
+    } cond ;
+
+: decimal-character-reference-start-state ( document n/f string -- document n'/f string )
+    take-char (decimal-character-reference-start-state) ;
+
+
+: (hexadecimal-character-reference-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup ascii-digit? ] [ unimplemented* ] }
+        { [ dup ascii-upper-hex-digit? ] [ unimplemented* ] }
+        { [ dup ascii-lower-hex-digit? ] [ unimplemented* ] }
+        { [ dup CHAR: ; = ] [ drop numeric-character-reference-end-state ] }
+        [ missing-semicolon-after-character-reference ]
+    } cond ;
+
+: hexadecimal-character-reference-state ( document n/f string -- document n'/f string )
+    take-char (hexadecimal-character-reference-state) ;
+
+
+: (decimal-character-reference-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        { [ dup ascii-digit? ] [ unimplemented* ] }
+        { [ dup CHAR: ; = ] [ drop numeric-character-reference-end-state ] }
+        [ missing-semicolon-after-character-reference ]
+    } cond ;
+
+: decimal-character-reference-state ( document n/f string -- document n'/f string )
+    take-char (decimal-character-reference-state) ;
+
+
+: (numeric-character-reference-end-state) ( document n/f string ch/f -- document n'/f string )
+    {
+        [ missing-semicolon-after-character-reference ]
+    } cond ;
+
+: numeric-character-reference-end-state ( document n/f string -- document n'/f string )
+    take-char (numeric-character-reference-end-state) ;
+
+
+
+: parse-html5 ( string -- document )
+    [ <document> 0 ] dip data-state 2drop ;
\ No newline at end of file