! Copyright (C) 2020 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays assocs combinators combinators.short-circuit io io.encodings.utf8 io.files json.reader kernel math math.order memoize modern.slices prettyprint sequences sequences.extras strings suffix-arrays words ; IN: html5 : 1sbuf ( ch -- sbuf ) [ SBUF" " clone ] dip over push ; inline : ?1sbuf ( ch -- sbuf ) [ SBUF" " clone ] dip [ over push ] when* ; inline ! https://html.spec.whatwg.org/multipage/parsing.html#tokenization ! https://infra.spec.whatwg.org/#namespaces CONSTANT: html-namespace "http://www.w3.org/1999/xhtml" CONSTANT: mathml-namespace "http://www.w3.org/1998/Math/MathML" CONSTANT: svg-namespace "http://www.w3.org/2000/svg" CONSTANT: xlink-namespace "http://www.w3.org/1999/xlink" CONSTANT: xml-namespace "http://www.w3.org/XML/1998/namespace" CONSTANT: xmlns-namespace "http://www.w3.org/2000/xmlns/" DEFER: data-state DEFER: (data-state) DEFER: rcdata-state DEFER: (rcdata-state) DEFER: rawtext-state DEFER: (rawtext-state) DEFER: script-data-state DEFER: (script-data-state) DEFER: plaintext-state DEFER: (plaintext-state) DEFER: tag-open-state DEFER: (tag-open-state) DEFER: end-tag-open-state DEFER: (end-tag-open-state) DEFER: tag-name-state DEFER: (tag-name-state) DEFER: rcdata-less-than-sign-state DEFER: (rcdata-less-than-sign-state) DEFER: rcdata-end-tag-open-state DEFER: (rcdata-end-tag-open-state) DEFER: rcdata-end-tag-name-state DEFER: (rcdata-end-tag-name-state) DEFER: rawtext-less-than-sign-state DEFER: (rawtext-less-than-sign-state) DEFER: rawtext-end-tag-open-state DEFER: (rawtext-end-tag-open-state) DEFER: rawtext-end-tag-name-state DEFER: (rawtext-end-tag-name-state) DEFER: script-data-less-than-sign-state DEFER: (script-data-less-than-sign-state) DEFER: script-data-end-tag-open-state DEFER: (script-data-end-tag-open-state) DEFER: script-data-end-tag-name-state DEFER: (script-data-end-tag-name-state) DEFER: script-data-escape-start-state DEFER: (script-data-escape-start-state) DEFER: script-data-escape-start-dash-state DEFER: (script-data-escape-start-dash-state) DEFER: script-data-escaped-state DEFER: (script-data-escaped-state) DEFER: script-data-escaped-dash-state DEFER: (script-data-escaped-dash-state) DEFER: script-data-escaped-dash-dash-state DEFER: (script-data-escaped-dash-dash-state) DEFER: script-data-escaped-less-than-sign-state DEFER: (script-data-escaped-less-than-sign-state) DEFER: script-data-escaped-end-tag-open-state DEFER: (script-data-escaped-end-tag-open-state) DEFER: script-data-escaped-end-tag-name-state DEFER: (script-data-escaped-end-tag-name-state) DEFER: script-data-double-escape-start-state DEFER: (script-data-double-escape-start-state) DEFER: script-data-double-escaped-state DEFER: (script-data-double-escaped-state) DEFER: script-data-double-escaped-dash-state DEFER: (script-data-double-escaped-dash-state) DEFER: script-data-double-escaped-dash-dash-state DEFER: (script-data-double-escaped-dash-dash-state) DEFER: script-data-double-escaped-less-than-sign-state DEFER: (script-data-double-escaped-less-than-sign-state) DEFER: script-data-double-escape-end-state DEFER: (script-data-double-escape-end-state) DEFER: before-attribute-name-state DEFER: (before-attribute-name-state) DEFER: attribute-name-state DEFER: (attribute-name-state) DEFER: after-attribute-name-state DEFER: (after-attribute-name-state) DEFER: before-attribute-value-state DEFER: (before-attribute-value-state) DEFER: attribute-value-double-quoted-state DEFER: (attribute-value-double-quoted-state) DEFER: attribute-value-single-quoted-state DEFER: (attribute-value-single-quoted-state) DEFER: attribute-value-unquoted-state DEFER: (attribute-value-unquoted-state) DEFER: after-attribute-value-quoted-state DEFER: (after-attribute-value-quoted-state) DEFER: self-closing-start-tag-state DEFER: (self-closing-start-tag-state) DEFER: bogus-comment-state DEFER: (bogus-comment-state) DEFER: markup-declaration-open-state DEFER: (markup-declaration-open-state) DEFER: comment-start-state DEFER: (comment-start-state) DEFER: comment-start-dash-state DEFER: (comment-start-dash-state) DEFER: comment-state DEFER: (comment-state) DEFER: comment-less-than-sign-state DEFER: (comment-less-than-sign-state) DEFER: comment-less-than-sign-bang-state DEFER: (comment-less-than-sign-bang-state) DEFER: comment-less-than-sign-bang-dash-state DEFER: (comment-less-than-sign-bang-dash-state) DEFER: comment-less-than-sign-bang-dash-dash-state DEFER: (comment-less-than-sign-bang-dash-dash-state) DEFER: comment-end-dash-state DEFER: (comment-end-dash-state) DEFER: comment-end-state DEFER: (comment-end-state) DEFER: comment-end-bang-state DEFER: (comment-end-bang-state) DEFER: doctype-state DEFER: (doctype-state) DEFER: before-doctype-name-state DEFER: (before-doctype-name-state) DEFER: doctype-name-state DEFER: (doctype-name-state) DEFER: after-doctype-name-state DEFER: (after-doctype-name-state) DEFER: after-doctype-public-keyword-state DEFER: (after-doctype-public-keyword-state) DEFER: before-doctype-public-identifier-state DEFER: (before-doctype-public-identifier-state) DEFER: doctype-public-identifier-double-quoted-state DEFER: (doctype-public-identifier-double-quoted-state) DEFER: doctype-public-identifier-single-quoted-state DEFER: (doctype-public-identifier-single-quoted-state) DEFER: after-doctype-public-identifier-state DEFER: (after-doctype-public-identifier-state) DEFER: between-doctype-public-and-system-identifiers-state DEFER: (between-doctype-public-and-system-identifiers-state) DEFER: after-doctype-system-keyword-state DEFER: (after-doctype-system-keyword-state) DEFER: before-doctype-system-identifier-state DEFER: (before-doctype-system-identifier-state) DEFER: doctype-system-identifier-double-quoted-state DEFER: (doctype-system-identifier-double-quoted-state) DEFER: doctype-system-identifier-single-quoted-state DEFER: (doctype-system-identifier-single-quoted-state) DEFER: after-doctype-system-identifier-state DEFER: (after-doctype-system-identifier-state) DEFER: bogus-doctype-state DEFER: (bogus-doctype-state) DEFER: cdata-section-state DEFER: (cdata-section-state) DEFER: cdata-section-bracket-state DEFER: (cdata-section-bracket-state) DEFER: cdata-section-end-state DEFER: (cdata-section-end-state) DEFER: character-reference-state DEFER: (character-reference-state) DEFER: named-character-reference-state DEFER: (named-character-reference-state) DEFER: ambiguous-ampersand-state DEFER: (ambiguous-ampersand-state) DEFER: numeric-character-reference-state DEFER: (numeric-character-reference-state) DEFER: hexadecimal-character-reference-start-state DEFER: (hexadecimal-character-reference-start-state) DEFER: decimal-character-reference-start-state DEFER: (decimal-character-reference-start-state) DEFER: hexadecimal-character-reference-state DEFER: (hexadecimal-character-reference-state) DEFER: decimal-character-reference-state DEFER: (decimal-character-reference-state) DEFER: numeric-character-reference-end-state DEFER: (numeric-character-reference-end-state) ERROR: unimplemented string ; ERROR: unimplemented* ; ! Errors: https://html.spec.whatwg.org/multipage/parsing.html#parse-errors ERROR: abrupt-closing-of-empty-comment ; ERROR: abrupt-doctype-public-identifier ; ERROR: abrupt-doctype-system-identifier ; ERROR: absence-of-digits-in-numeric-character-reference ; ERROR: cdata-in-html-content ; ERROR: character-reference-outside-unicode-range ; ERROR: control-character-in-input-stream ; ERROR: control-character-reference ; ERROR: end-tag-with-attributes ; ERROR: duplicate-attribute ; ERROR: end-tag-with-trailing-solidus ; ERROR: eof-before-tag-name ; ERROR: eof-in-cdata ; ERROR: eof-in-comment ; ERROR: eof-in-doctype ; ERROR: eof-in-script-html-comment-like-text ; ERROR: eof-in-tag ; ERROR: incorrectly-closed-comment ; ERROR: incorrectly-opened-comment ; ERROR: invalid-character-sequence-after-doctype-name ; ERROR: invalid-first-character-of-tag-name ; ERROR: missing-attribute-value ; ERROR: missing-doctype-name ; ERROR: missing-doctype-public-identifier ; ERROR: missing-doctype-system-identifier ; ERROR: missing-end-tag-name ; ERROR: missing-quote-before-doctype-public-identifier ; ERROR: missing-quote-before-doctype-system-identifier ; ERROR: missing-semicolon-after-character-reference ; ERROR: missing-whitespace-after-doctype-public-keyword ; ERROR: missing-whitespace-after-doctype-system-keyword ; ERROR: missing-whitespace-before-doctype-name ; ERROR: missing-whitespace-between-attributes ; ERROR: missing-whitespace-between-doctype-public-and-system-identifiers ; ERROR: nested-comment ; ERROR: noncharacter-character-reference ; ERROR: noncharacter-in-input-stream ; ERROR: non-void-html-element-start-tag-with-trailing-solidus ; ERROR: null-character-reference ; ERROR: surrogate-character-reference ; ERROR: surrogate-in-input-stream ; ERROR: unexpected-character-after-doctype-system-identifier ; ERROR: unexpected-character-in-attribute-name ; ERROR: unexpected-character-in-unquoted-attribute-value ; ERROR: unexpected-equals-sign-before-attribute-name ; ERROR: unexpected-null-character ; ERROR: unexpected-question-mark-instead-of-tag-name ; ERROR: unexpected-solidus-in-tag ; ERROR: unknown-named-character-reference ; ! Tree insertion modes SINGLETONS: initial-mode before-html-mode before-head-mode in-head-mode in-head-noscript-mode after-head-mode in-body-mode text-mode in-table-mode in-table-text-mode in-caption-mode in-column-group-mode in-table-body-mode in-row-mode in-cell-mode in-select-mode in-select-in-table-mode in-template-mode after-body-mode in-frameset-mode after-frameset-mode after-after-body-mode after-after-frameset-mode ; TUPLE: document quirks-mode? limited-quirks-mode? iframe-srcdoc? scripting? ! set in constructor frameset-ok? ! frameset-ok? but we want default to f fostering-parent? tree tree-doctype head-element-pointer ! set during insertion time parser-cannot-change-mode-flag insertion-mode original-insertion-mode last node context doctype tag end-tag tag-name end-tag-name attribute-name attribute-value temporary-buffer comment-token open-elements return-state ; ! "reset the insertion mode appropriately" ! : reset-insertion-mode ( document -- document ) ! f >>last ! dup open-elements>> ?last >>node ! dup [ open-elements>> ?first ] [ node>> ] bi = [ ! t >>last dup node>> >>context ! ] when ! dup node>> { ! { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] } ! { [ ! dup name>> >lower { "td" "th" } member? ! pick last>> f = and ! ] [ drop in-select >>insertion-mode ] } ! { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] } ! { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] } ! { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] } ! { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] } ! { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] } ! { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] } ! { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] } ! { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] } ! { [ dup name>> >lower "select" = ] [ drop in-select >>insertion-mode ] } ! } cond ! ; : temporary-buffer-attribute? ( document -- ? ) return-state>> { attribute-value-unquoted-state attribute-value-single-quoted-state attribute-value-double-quoted-state } member? ; ! name, public/system identifier should not be empty strings ! until the state machine demands it TUPLE: doctype name public-identifier system-identifier quirks? ; : ( -- doctype ) doctype new ; inline : new-doctype-from-ch ( ch document -- ) [ doctype new swap ?1sbuf >>name ] dip doctype<< ; inline : new-doctype-with-quirks ( document -- ) t >>quirks? >>doctype drop ; TUPLE: tag self-closing? name attributes children end-tag ; : ( -- tag ) tag new SBUF" " clone >>name V{ } clone >>attributes V{ } clone >>children ; TUPLE: end-tag self-closing? name attributes ; : ( -- tag ) end-tag new SBUF" " clone >>name V{ } clone >>attributes ; : new-tag ( document -- ) >>tag drop ; : new-end-tag ( document -- ) >>tag drop ; : set-self-closing ( document -- ) tag>> t >>self-closing? drop ; : ( -- document ) document new V{ } clone >>tree initial-mode >>insertion-mode >>doctype t >>frameset-ok? ! SBUF" " clone >>tag-name SBUF" " clone >>attribute-name SBUF" " clone >>attribute-value SBUF" " clone >>temporary-buffer SBUF" " clone >>comment-token V{ } clone >>open-elements ; inline TUPLE: comment open payload close ; : ( payload -- comment ) comment new swap >>payload ; inline : force-quirks ( document -- ) doctype>> t >>quirks? drop ; : initialize-doctype-name ( document -- ) [ SBUF" " clone ] dip doctype>> name<< ; : initialize-doctype-public-identifier ( document -- ) [ SBUF" " clone ] dip doctype>> public-identifier<< ; : initialize-doctype-system-identifier ( document -- ) [ SBUF" " clone ] dip doctype>> system-identifier<< ; : push-doctype-name ( ch document -- ) doctype>> name>> push ; : push-doctype-public-identifier ( ch document -- ) doctype>> public-identifier>> push ; : push-doctype-system-identifier ( ch document -- ) doctype>> system-identifier>> push ; ! XXX: not html5 spec, fix ERROR: unmatched-closing-tag-error stack tag ; : unclosed-tag? ( obj -- ? ) { [ tag? ] [ end-tag>> not ] } 1&& ; inline :: find-matching-tag ( name stack -- seq ) stack [ { [ unclosed-tag? ] [ name>> name = ] } 1&& ] find-last drop [ stack swap shorten* ] [ stack name unmatched-closing-tag-error ] if* ; DEFER: tree-insert GENERIC: tree-insert* ( document obj insertion-mode -- document ) : limited-quirks-mode? ( doctype -- ? ) { [ public-identifier>> "-//W3C//DTD XHTML 1.0 Frameset//" head? ] [ public-identifier>> "-//W3C//DTD XHTML 1.0 Transitional//" head? ] [ { [ system-identifier>> ] [ public-identifier>> "-//W3C//DTD HTML 4.01 Frameset//" head? ] } 1&& ] [ { [ system-identifier>> ] [ public-identifier>> "-//W3C//DTD HTML 4.01 Transitional//" head? ] } 1&& ] } 1|| ; ! https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode M: initial-mode tree-insert* drop { { [ dup "\t\n\f\r\s" member? ] [ drop ] } { [ dup doctype? ] [ >>tree-doctype before-html-mode >>insertion-mode ] } { [ dup comment? ] [ over tree>> push ] } { [ dup tag? ] [ over tree>> push ] } { [ dup end-tag? ] [ dup name>> pick tree>> find-matching-tag unclip swap >>children swap >>end-tag over tree>> push ] } [ over iframe-srcdoc?>> [ over parser-cannot-change-mode-flag>> [ [ t >>quirks-mode? ] dip ] unless ] [ "must be iframe-srcdoc here" throw ] if ! reprocess the token before-html-mode >>insertion-mode tree-insert ] } cond ; ! https://html.spec.whatwg.org/multipage/parsing.html#the-before-html-insertion-mode M: before-html-mode tree-insert* drop { { [ dup doctype? ] [ drop ] } { [ dup comment? ] [ over tree>> push ] } { [ dup "\t\n\f\r\s" member? ] [ drop ] } { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [ over tree>> push before-head-mode >>insertion-mode ] } ! these tags are handled in the default case { [ dup { [ end-tag? ] [ name>> { "head" "body" "html" "br" } member? not ] } 1&& ] [ ! error end-tag, ignore drop ] } [ ! Create missing html tag and reprocess the token "html" >>name pick tree>> push before-head-mode >>insertion-mode tree-insert ] } cond ; M: before-head-mode tree-insert* drop { { [ dup "\t\n\f\r\s" member? ] [ drop ] } { [ dup comment? ] [ over tree>> push ] } { [ dup doctype? ] [ drop ] } { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [ ! XXX: in-body-mode rules here for html tag ! B ! over tree>> push ! before-head-mode >>insertion-mode "handle html in-body-mode here" throw ] } { [ dup { [ tag? ] [ name>> "head" = ] } 1&& ] [ [ swap tree>> push ] [ >>head-element-pointer drop ] [ drop in-head-mode >>insertion-mode ] 2tri ] } ! these tags are handled in the default case { [ dup { [ end-tag? ] [ name>> { "head" "body" "html" "br" } member? not ] } 1&& ] [ ! error end-tag, ignore drop ] } ! ignore tag { [ dup tag? ] [ drop ] } [ ! Create missing html tag and reprocess the token [ "head" >>name pick tree>> push ] [ >>head-element-pointer ] bi in-head-mode >>insertion-mode tree-insert ] } cond ; M: in-head-mode tree-insert* drop { { [ dup "\t\n\f\r\s" member? ] [ over tree>> push ] } { [ dup comment? ] [ over tree>> push ] } { [ dup doctype? ] [ drop ] } { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [ ! XXX: in-body-mode rules here for html tag ! B ! over tree>> push ! before-head-mode >>insertion-mode "handle html in-body-mode here" throw unimplemented* ] } { [ dup { [ tag? ] [ name>> { "base" "basefont" "bgsound" "link" } member? ] } 1&& ] [ ! non-void-html-element-start-tag-with-trailing-solidus soft error if not self-closing unimplemented* ] } { [ dup { [ tag? ] [ name>> "meta" = ] } 1&& ] [ unimplemented* ] } { [ dup { [ tag? ] [ name>> "title" = ] } 1&& ] [ ! https://html.spec.whatwg.org/multipage/parsing.html#generic-rcdata-element-parsing-algorithm "insert title node" throw unimplemented* ] } { [ dup { [ { [ tag? ] [ name>> "noscript" = ] [ scripting?>> ] } 1&& ] [ { [ tag? ] [ name>> { "noframes" "style" } member? ] } 1&& ] } 1|| ] [ ! https://html.spec.whatwg.org/multipage/parsing.html#generic-raw-text-element-parsing-algorithm unimplemented* ] } { [ dup { [ tag? ] [ name>> "noscript" = ] [ scripting?>> not ] } 1&& ] [ unimplemented* over tree>> push in-head-noscript-mode >>insertion-mode ] } { [ dup { [ tag? ] [ name>> "script" = ] } 1&& ] [ unimplemented* text-mode >>insertion-mode ] } { [ dup { [ end-tag? ] [ name>> "head" = ] } 1&& ] [ over tree>> last end-tag<< after-head-mode >>insertion-mode ] } { [ dup { [ end-tag? ] [ name>> { "body" "html" "br" } member? ] } 1&& ] [ ! non-void-html-element-start-tag-with-trailing-solidus soft error if not self-closing unimplemented* ] } { [ dup { [ tag? ] [ name>> "template" = ] } 1&& ] [ unimplemented* in-template-mode >>insertion-mode ] } { [ dup { [ end-tag? ] [ name>> "template" = ] } 1&& ] [ unimplemented* ] } ! XXX: revisit this { [ dup { [ { [ tag? ] [ name>> "head" = ] } 1&& ] [ end-tag? ] } 1|| ] [ drop "ignore here" throw ] } [ ! end head tag should be here, pop off, reprocess over tree>> pop swap >>end-tag after-head-mode >>insertion-mode "omg" throw ] } cond ; M: in-head-noscript-mode tree-insert* drop unimplemented* ; M: after-head-mode tree-insert* drop { { [ dup "\t\n\f\r\s" member? ] [ over tree>> push ] } { [ dup comment? ] [ over tree>> push ] } { [ dup doctype? ] [ drop ] } { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [ ! XXX: in-body-mode rules here for html tag ! B ! over tree>> push ! before-head-mode >>insertion-mode "handle html in-body-mode here" throw unimplemented* ] } { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [ unimplemented* ] } { [ dup { [ tag? ] [ name>> "body" = ] } 1&& ] [ over tree>> push f >>frameset-ok? in-body-mode >>insertion-mode ] } { [ dup { [ tag? ] [ name>> "frameset" = ] } 1&& ] [ unimplemented* ] } { [ dup { [ tag? ] [ name>> { "base" "basefont" "bgsound" "link" "meta" "noframes" "script" "style" "template" "title" } member? ] } 1&& ] [ unimplemented* ] } { [ dup { [ end-tag? ] [ name>> "template" = ] } 1&& ] [ unimplemented* ] } ! same as default case ! { [ dup { [ end-tag? ] [ name>> { "body" "html" "br" } member? not ] } 1&& ] [ ! unimplemented* ! ] } { [ dup { [ { [ tag? ] [ name>> "head" = ] } 1&& ] [ { [ end-tag? ] [ name>> { "body" "html" "br" } member? not ] } 1&& ] } 1|| ] [ "omg revisit this" throw unimplemented* ] } [ B "body" >>name pick tree>> push in-body-mode >>insertion-mode tree-insert ] } cond ; M: in-body-mode tree-insert* drop { { [ dup CHAR: \0 = ] [ drop ] } { [ dup "\t\n\f\r\s" member? ] [ over tree>> push ] } { [ dup comment? ] [ over tree>> push ] } { [ dup doctype? ] [ drop ] } { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [ drop ] } { [ dup { [ { [ tag? ] [ name>> { "base" "basefont" "bgsound" "link" "meta" "noframes" "script" "style" "template" "title" } member? ] } 1&& ] [ { [ end-tag? ] [ name>> "template" = ] } 1&& ] } 1|| ] [ unimplemented* ] } ! XXX: parse error { [ dup { [ tag? ] [ name>> "body" = ] } 1&& ] [ drop unimplemented* ] } { [ dup { [ tag? ] [ name>> "frameset" = ] } 1&& ] [ drop unimplemented* ] } ! XXX: eof ! { [ ] [ ] } { [ dup { [ end-tag? ] [ name>> "body" = ] } 1&& ] [ "body" pick tree>> find-matching-tag unclip swap >>children swap >>end-tag over tree>> push after-body-mode >>insertion-mode ] } { [ dup { [ end-tag? ] [ name>> "html" = ] } 1&& ] [ drop unimplemented* ] } ! { [ ] [ ] } [ unimplemented* ] } cond ; M: text-mode tree-insert* drop unimplemented* ; M: in-table-mode tree-insert* drop unimplemented* ; M: in-table-text-mode tree-insert* drop unimplemented* ; M: in-caption-mode tree-insert* drop unimplemented* ; M: in-column-group-mode tree-insert* drop unimplemented* ; M: in-table-body-mode tree-insert* drop unimplemented* ; M: in-row-mode tree-insert* drop unimplemented* ; M: in-cell-mode tree-insert* drop unimplemented* ; M: in-select-mode tree-insert* drop unimplemented* ; M: in-select-in-table-mode tree-insert* drop unimplemented* ; M: in-template-mode tree-insert* drop unimplemented* ; M: after-body-mode tree-insert* drop { { [ dup "\t\n\f\r\s" member? ] [ over tree>> push ] } { [ dup comment? ] [ over tree>> push ] } { [ dup doctype? ] [ drop ] } { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [ unimplemented* ] } { [ dup { [ end-tag? ] [ name>> "html" = ] } 1&& ] [ ! XXX: make this a function "html" pick tree>> find-matching-tag unclip swap >>children swap >>end-tag over tree>> push after-after-body-mode >>insertion-mode ] } [ unimplemented* ] } cond ; M: in-frameset-mode tree-insert* drop unimplemented* ; M: after-frameset-mode tree-insert* drop unimplemented* ; M: after-after-body-mode tree-insert* drop { { [ dup comment? ] [ over tree>> push ] } { [ dup doctype? ] [ unimplemented* ] } { [ dup "\t\n\f\r\s" member? ] [ unimplemented* ] } { [ dup { [ tag? ] [ name>> "html" = ] } 1&& ] [ unimplemented* ] } ! eof { [ dup f = ] [ drop ] } [ ! XXX: parse error [ in-body-mode >>insertion-mode ] dip tree-insert ] } cond ; M: after-after-frameset-mode tree-insert* drop unimplemented* ; : tree-insert ( document obj -- document ) over insertion-mode>> tree-insert* ; MEMO: load-entities ( -- assoc ) "vocab:html5/entities.json" utf8 file-contents json> ; MEMO: entities-suffix-array ( -- assoc ) load-entities keys >suffix-array ; : lookup-entity ( string -- entity/string ? ) load-entities ?at ; : named-character-match? ( document -- prefix? exact? ) temporary-buffer>> [ entities-suffix-array query f like ] [ last CHAR: ; = ] bi ; ERROR: unknown-named-entity entity ; : take-named-character ( document -- ) dup temporary-buffer>> >string lookup-entity [ "characters" of SBUF" " clone-like >>temporary-buffer drop ] [ unknown-named-entity ] if ; ! XXX: remove the tag>> name>> push part : push-tag-name ( ch document -- ) [ tag>> name>> push ] [ 2drop ! tag-name>> push ] 2bi ; : push-attribute-name ( ch document -- ) attribute-name>> push ; : push-attribute-value ( ch document -- ) attribute-value>> push ; : push-comment-token ( ch document -- ) comment-token>> push ; : push-all-comment-token ( string document -- ) comment-token>> push-all ; ERROR: invalid-return-state obj ; : check-return-state ( obj -- return-state ) dup word? [ invalid-return-state ] unless ; : current-attribute ( document -- attribute/f ) [ attribute-name>> >string f like ] [ attribute-value>> >string f like ] bi 2dup or [ 2array ] [ 2drop f ] if ; : push-when ( obj/f seq -- ) over [ push ] [ 2drop ] if ; inline : reset-attribute ( document -- ) SBUF" " clone >>attribute-name SBUF" " clone >>attribute-value drop ; : push-attribute ( document -- ) [ current-attribute ] [ tag>> attributes>> push-when ] [ reset-attribute ] tri ; : emit-eof ( document -- ) "emit-eof" print f tree-insert drop ; : emit-char ( char document -- ) drop "emit-char: " write 1string . ; : emit-string ( char document -- ) drop "emit-string: " write . ; : emit-tag ( document -- ) "emit-tag: " write { [ tag>> [ name>> >string ] [ name<< ] bi ] [ push-attribute ] [ tag>> . ] [ dup tag>> tree-insert drop ] [ f >>tag drop ] } cleave ; : emit-end-tag ( document -- ) "emit-end-tag: " write [ tag>> . ] [ f >>tag drop ] bi ; : emit-comment-token ( document -- ) "emit-comment-token: " write { [ comment-token>> >string . ] [ dup comment-token>> >string tree-insert drop ] [ SBUF" " clone >>comment-token drop ] } cleave ; : emit-doctype ( document -- ) "emit-doctype: " write dup doctype>> . { [ doctype>> [ >string ] change-name drop ] [ ! XXX: handle iframe srcdoc document dup { [ doctype>> name>> "html" = not ] [ parser-cannot-change-mode-flag>> not ] } 1&& [ t >>quirks-mode? ] [ dup { [ iframe-srcdoc?>> not ] [ parser-cannot-change-mode-flag>> not ] } 1&& [ dup doctype>> limited-quirks-mode? [ t >>limited-quirks-mode? ] when ] when ] if drop ] [ dup doctype>> tree-insert drop ] [ f >>doctype drop ] } cleave ; : reset-temporary-buffer ( document -- ) SBUF" " clone temporary-buffer<< ; : ch>new-temporary-buffer ( ch document -- ) [ 1sbuf ] dip temporary-buffer<< ; : string>new-temporary-buffer ( string document -- ) [ SBUF" " clone-like ] dip temporary-buffer<< ; : temporary-buffer-last ( document -- ch/f ) temporary-buffer>> ?last ; : push-temporary-buffer ( ch document -- ) temporary-buffer>> push ; : push-all-temporary-buffer ( string document -- ) temporary-buffer>> push-all ; : flush-temporary-buffer ( document -- ) "flush-temporary-buffer: " write [ [ temporary-buffer>> ] keep [ emit-char ] curry each ] [ SBUF" " clone >>temporary-buffer drop ] bi ; : emit-temporary-buffer-with ( string document -- ) [ temporary-buffer>> push-all ] [ flush-temporary-buffer ] bi ; ! check if matches open tag : appropriate-end-tag-token? ( document -- ? ) drop f ; : ascii-upper-alpha? ( ch -- ? ) [ CHAR: A CHAR: Z between? ] [ f ] if* ; inline : ascii-lower-alpha? ( ch -- ? ) [ CHAR: a CHAR: z between? ] [ f ] if* ; inline : ascii-upper-hex-digit? ( ch -- ? ) [ CHAR: A CHAR: F between? ] [ f ] if* ; inline : ascii-lower-hex-digit? ( ch -- ? ) [ CHAR: a CHAR: f between? ] [ f ] if* ; inline : ascii-hex-alpha? ( ch -- ? ) { [ ascii-upper-hex-digit? ] [ ascii-lower-hex-digit? ] } 1|| ; inline : ascii-digit? ( ch/f -- ? ) [ CHAR: 0 CHAR: 9 between? ] [ f ] if* ; : ascii-alpha? ( ch/f -- ? ) { [ ascii-lower-alpha? ] [ ascii-upper-alpha? ] } 1|| ; : ascii-alphanumeric? ( ch/f -- ? ) { [ ascii-alpha? ] [ ascii-digit? ] } 1|| ; : ascii-hex-digit? ( ch/f -- ? ) { [ ascii-digit? ] [ ascii-hex-alpha? ] } 1|| ; : (return-state) ( document n/f string ch/f -- document n'/f string ) reach [ f ] change-return-state drop check-return-state execute( document n/f string ch/f -- document n'/f string ) ; : return-state ( document n/f string -- document n'/f string ) pick [ f ] change-return-state drop check-return-state execute( document n/f string -- document n'/f string ) ; : (data-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: & = ] [ drop [ \ data-state >>return-state ] 2dip character-reference-state ] } { [ dup CHAR: < = ] [ drop tag-open-state ] } { [ dup CHAR: \0 = ] [ unexpected-null-character ] } { [ dup f = ] [ drop pick emit-eof ] } [ reach emit-char data-state ] } cond ; : data-state ( document n/f string -- document n'/f string ) take-char (data-state) ; : (rcdata-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: & = ] [ drop [ \ rcdata-state >>return-state ] 2dip character-reference-state ] } { [ dup CHAR: < = ] [ drop rcdata-less-than-sign-state ] } { [ dup CHAR: \0 = ] [ unexpected-null-character ] } { [ dup f = ] [ drop pick emit-eof ] } [ reach emit-char rcdata-state ] } cond ; : rcdata-state ( document n/f string -- document n'/f string ) take-char (rcdata-state) ; : (rawtext-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: < = ] [ drop rawtext-less-than-sign-state ] } { [ dup CHAR: \0 = ] [ drop unexpected-null-character ] } { [ dup f = ] [ drop pick emit-eof ] } [ reach emit-char rawtext-state ] } cond ; : rawtext-state ( document n/f string -- document n'/f string ) take-char (rawtext-state) ; : (script-data-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: < = ] [ drop script-data-less-than-sign-state ] } { [ dup CHAR: \0 = ] [ drop unexpected-null-character ] } { [ dup f = ] [ drop pick emit-eof ] } [ reach emit-char script-data-state ] } cond ; : script-data-state ( document n/f string -- document n'/f string ) take-char (script-data-state) ; : (plaintext-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: \0 = ] [ drop unexpected-null-character ] } { [ dup f = ] [ drop pick emit-eof ] } [ reach emit-char plaintext-state ] } cond ; : plaintext-state ( document n/f string -- document n'/f string ) take-char (plaintext-state) ; : (tag-open-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup ascii-alpha? ] [ reach new-tag (tag-name-state) ] } { [ dup CHAR: ! = ] [ drop markup-declaration-open-state ] } { [ dup CHAR: / = ] [ drop end-tag-open-state ] } { [ dup CHAR: ? = ] [ unexpected-question-mark-instead-of-tag-name ] } { [ dup f = ] [ eof-before-tag-name ] } [ invalid-first-character-of-tag-name ] } cond ; : tag-open-state ( document n/f string -- document n'/f string ) take-char (tag-open-state) ; : (end-tag-open-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup ascii-alpha? ] [ reach new-end-tag (tag-name-state) ] } { [ dup CHAR: > = ] [ missing-end-tag-name ] } { [ dup f = ] [ eof-before-tag-name ] } [ invalid-first-character-of-tag-name ] } cond ; : end-tag-open-state ( document n/f string -- document n'/f string ) take-char (end-tag-open-state) ; : (tag-name-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup ascii-upper-alpha? ] [ 0x20 + reach push-tag-name tag-name-state ] } { [ dup "\t\n\f\s" member? ] [ drop before-attribute-name-state ] } { [ dup CHAR: / = ] [ drop self-closing-start-tag-state ] } { [ dup CHAR: > = ] [ drop pick emit-tag data-state ] } { [ dup CHAR: \0 = ] [ unexpected-null-character ] } { [ dup f = ] [ eof-before-tag-name ] } [ reach push-tag-name tag-name-state ] } cond ; : tag-name-state ( document n/f string -- document n'/f string ) take-char (tag-name-state) ; : (rcdata-less-than-sign-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: / = ] [ drop pick reset-temporary-buffer rcdata-end-tag-open-state ] } [ [ CHAR: < reach emit-char ] dip (rcdata-state) ] } cond ; : rcdata-less-than-sign-state ( document n/f string -- document n'/f string ) take-char (rcdata-less-than-sign-state) ; : (rcdata-end-tag-open-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup ascii-alpha? ] [ reach new-end-tag (rcdata-end-tag-name-state) ] } [ [ CHAR: < reach emit-char ] dip (rcdata-state) ] } cond ; : rcdata-end-tag-open-state ( document n/f string -- document n'/f string ) take-char (rcdata-end-tag-open-state) ; : (rcdata-end-tag-name-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup "\t\n\f\s" member? ] [ drop pick appropriate-end-tag-token? [ before-attribute-name-state ] [ " = ] [ drop pick appropriate-end-tag-token? [ pick emit-end-tag data-state ] [ " = ] [ drop pick appropriate-end-tag-token? [ pick emit-end-tag data-state ] [ " = ] [ drop pick appropriate-end-tag-token? [ pick emit-end-tag data-state ] [ " = ] [ reach emit-char script-data-state ] } { [ dup CHAR: \0 = ] [ unexpected-null-character script-data-escaped-state ] } { [ dup f = ] [ eof-in-script-html-comment-like-text ] } [ reach emit-char script-data-escaped-state ] } cond ; : script-data-escaped-dash-dash-state ( document n/f string -- document n'/f string ) take-char (script-data-escaped-dash-dash-state) ; : (script-data-escaped-less-than-sign-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: / = ] [ drop pick reset-temporary-buffer script-data-escaped-end-tag-open-state ] } { [ dup ascii-alpha? ] [ [ pick reset-temporary-buffer CHAR: < reach emit-char ] dip (script-data-double-escape-start-state) ] } [ [ CHAR: < reach emit-char ] dip (script-data-escaped-state) ] } cond ; : script-data-escaped-less-than-sign-state ( document n/f string -- document n'/f string ) take-char (script-data-escaped-less-than-sign-state) ; : (script-data-escaped-end-tag-open-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup ascii-alpha? ] [ [ pick new-end-tag ] dip (script-data-escaped-end-tag-name-state) ] } [ [ " = ] [ drop pick appropriate-end-tag-token? [ pick emit-end-tag data-state ] [ "" member? ] [ reach emit-char pick temporary-buffer>> "script" sequence= [ script-data-double-escaped-state ] [ script-data-escaped-state ] if ] } { [ dup ascii-upper-alpha? ] [ [ 0x20 + reach push-tag-name ] [ reach push-temporary-buffer ] bi script-data-double-escape-start-state ] } { [ dup ascii-lower-alpha? ] [ [ reach push-tag-name ] [ reach push-temporary-buffer ] bi script-data-double-escape-start-state ] } ! todo [ (script-data-escaped-state) ] } cond ; : script-data-double-escape-start-state ( document n/f string -- document n'/f string ) take-char (script-data-double-escape-start-state) ; : (script-data-double-escaped-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: - = ] [ reach emit-char script-data-double-escaped-dash-state ] } { [ dup CHAR: < = ] [ reach emit-char script-data-double-escaped-less-than-sign-state ] } { [ dup CHAR: \0 = ] [ unexpected-null-character CHAR: replacement-character reach emit-char script-data-double-escaped-state ] } { [ dup f = ] [ eof-in-script-html-comment-like-text ] } [ reach emit-char script-data-double-escaped-state ] } cond ; : script-data-double-escaped-state ( document n/f string -- document n'/f string ) take-char (script-data-double-escaped-state) ; : (script-data-double-escaped-dash-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: - = ] [ reach emit-char script-data-double-escaped-dash-dash-state ] } { [ dup CHAR: < = ] [ reach emit-char script-data-double-escaped-less-than-sign-state ] } { [ dup CHAR: \0 = ] [ unexpected-null-character CHAR: replacement-character reach emit-char script-data-double-escaped-state ] } { [ dup f = ] [ eof-in-script-html-comment-like-text ] } [ reach emit-char script-data-double-escaped-state ] } cond ; : script-data-double-escaped-dash-state ( document n/f string -- document n'/f string ) take-char (script-data-double-escaped-dash-state) ; : (script-data-double-escaped-dash-dash-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: - = ] [ reach emit-char script-data-double-escaped-dash-dash-state ] } { [ dup CHAR: < = ] [ reach emit-char script-data-double-escaped-less-than-sign-state ] } { [ dup CHAR: > = ] [ reach emit-char script-data-state ] } { [ dup CHAR: \0 = ] [ unexpected-null-character CHAR: replacement-character reach emit-char script-data-double-escaped-state ] } { [ dup f = ] [ eof-in-script-html-comment-like-text ] } [ reach emit-char script-data-escaped-state ] } cond ; : script-data-double-escaped-dash-dash-state ( document n/f string -- document n'/f string ) take-char (script-data-double-escaped-dash-dash-state) ; : (script-data-double-escaped-less-than-sign-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: / = ] [ reach emit-char pick reset-temporary-buffer script-data-double-escape-end-state ] } [ (script-data-double-escaped-state) ] } cond ; : script-data-double-escaped-less-than-sign-state ( document n/f string -- document n'/f string ) take-char (script-data-double-escaped-less-than-sign-state) ; : (script-data-double-escape-end-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup "\t\n\f\s/>" member? ] [ reach emit-char pick temporary-buffer>> "script" sequence= [ script-data-escaped-state ] [ script-data-double-escaped-state ] if ] } { [ dup ascii-upper-alpha? ] [ [ 0x20 + reach push-tag-name ] [ reach push-temporary-buffer ] bi script-data-double-escape-end-state ] } { [ dup ascii-lower-alpha? ] [ [ reach push-tag-name ] [ reach push-temporary-buffer ] bi script-data-double-escape-end-state ] } ! todo [ (script-data-double-escaped-state) ] } cond ; : script-data-double-escape-end-state ( document n/f string -- document n'/f string ) take-char (script-data-double-escape-end-state) ; : (before-attribute-name-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup "\t\n\f\s" member? ] [ drop before-attribute-name-state ] } { [ dup "/>" member? ] [ (after-attribute-name-state) ] } { [ dup f = ] [ (after-attribute-name-state) ] } { [ dup CHAR: = = ] [ unexpected-equals-sign-before-attribute-name ] } [ reach push-attribute (attribute-name-state) ] } cond ; : before-attribute-name-state ( document n/f string -- document n'/f string ) take-char (before-attribute-name-state) ; : (attribute-name-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup "\t\n\f\s/>" member? ] [ (after-attribute-name-state) ] } { [ dup f = ] [ (after-attribute-name-state) ] } { [ dup CHAR: = = ] [ drop before-attribute-value-state ] } { [ dup ascii-upper-alpha? ] [ 0x20 + reach push-attribute-name attribute-name-state ] } { [ dup CHAR: \0 = ] [ unexpected-null-character ] } { [ dup "\"'<" member? ] [ unexpected-character-in-attribute-name reach push-attribute-name attribute-name-state ] } [ reach push-attribute-name attribute-name-state ] } cond ; : attribute-name-state ( document n/f string -- document n'/f string ) take-char (attribute-name-state) ; : (after-attribute-name-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup "\t\n\f\s" member? ] [ drop after-attribute-name-state ] } { [ dup CHAR: / = ] [ drop self-closing-start-tag-state ] } { [ dup CHAR: = = ] [ drop before-attribute-value-state ] } { [ dup CHAR: > = ] [ drop pick emit-tag data-state ] } { [ dup f = ] [ eof-in-tag ] } [ [ pick push-attribute ] dip (attribute-name-state) ] } cond ; : after-attribute-name-state ( document n/f string -- document n'/f string ) take-char (after-attribute-name-state) ; : (before-attribute-value-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup "\t\n\f\s" member? ] [ drop before-attribute-name-state ] } { [ dup CHAR: " = ] [ drop attribute-value-double-quoted-state ] } { [ dup CHAR: ' = ] [ drop attribute-value-single-quoted-state ] } { [ dup CHAR: > = ] [ drop missing-attribute-value ] } [ (attribute-value-unquoted-state) ] } cond ; : before-attribute-value-state ( document n/f string -- document n'/f string ) take-char (before-attribute-value-state) ; : (attribute-value-double-quoted-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: " = ] [ drop after-attribute-value-quoted-state ] } { [ dup CHAR: & = ] [ drop [ \ attribute-value-double-quoted-state >>return-state ] 2dip character-reference-state ] } { [ dup CHAR: \0 = ] [ unexpected-null-character ] } { [ dup f = ] [ eof-in-tag ] } [ reach push-attribute-value attribute-value-double-quoted-state ] } cond ; : attribute-value-double-quoted-state ( document n/f string -- document n'/f string ) take-char (attribute-value-double-quoted-state) ; : (attribute-value-single-quoted-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: ' = ] [ drop after-attribute-value-quoted-state ] } { [ dup CHAR: & = ] [ drop [ \ attribute-value-single-quoted-state >>return-state ] 2dip character-reference-state ] } { [ dup CHAR: \0 = ] [ drop unexpected-null-character CHAR: replacement-character reach push-attribute-value ] } { [ dup f = ] [ eof-in-tag ] } [ reach push-attribute-value attribute-value-single-quoted-state ] } cond ; : attribute-value-single-quoted-state ( document n/f string -- document n'/f string ) take-char (attribute-value-single-quoted-state) ; : (attribute-value-unquoted-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup "\t\n\f\s" member? ] [ drop before-attribute-name-state ] } { [ dup CHAR: & = ] [ drop [ \ attribute-value-unquoted-state >>return-state ] 2dip character-reference-state ] } { [ dup CHAR: > = ] [ drop pick emit-tag data-state ] } { [ dup CHAR: \0 = ] [ drop unexpected-null-character CHAR: replacement-character reach push-attribute-value ] } { [ dup "\"'<=`" member? ] [ unexpected-character-in-unquoted-attribute-value reach push-attribute-value attribute-value-unquoted-state ] } { [ dup f = ] [ eof-in-tag ] } [ reach push-attribute-value attribute-value-unquoted-state ] } cond ; : attribute-value-unquoted-state ( document n/f string -- document n'/f string ) take-char (attribute-value-unquoted-state) ; : (after-attribute-value-quoted-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup "\t\n\f\s" member? ] [ drop before-attribute-name-state ] } { [ dup CHAR: / = ] [ drop self-closing-start-tag-state ] } { [ dup CHAR: > = ] [ drop pick emit-tag data-state ] } { [ dup f = ] [ eof-in-tag ] } [ missing-whitespace-between-attributes (before-attribute-name-state) ] } cond ; : after-attribute-value-quoted-state ( document n/f string -- document n'/f string ) take-char (after-attribute-value-quoted-state) ; : (self-closing-start-tag-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: > = ] [ drop pick [ set-self-closing ] [ emit-tag ] bi data-state ] } { [ dup f = ] [ eof-in-tag ] } [ unexpected-solidus-in-tag ] } cond ; : self-closing-start-tag-state ( document n/f string -- document n'/f string ) take-char (self-closing-start-tag-state) ; : (bogus-comment-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: > = ] [ drop pick emit-comment-token data-state ] } { [ dup f = ] [ drop pick [ emit-comment-token ] [ emit-eof ] bi ] } { [ dup CHAR: \0 = ] [ drop unexpected-null-character CHAR: replacement-character reach push-comment-token ] } [ reach push-comment-token bogus-comment-state ] } cond ; : bogus-comment-state ( document n/f string -- document n'/f string ) take-char (bogus-comment-state) ; : markup-declaration-open-state ( document n/f string -- document n'/f string ) { { [ "--" take-from? ] [ comment-start-state ] } { [ "DOCTYPE" take-from-insensitive? ] [ doctype-state ] } { [ "[CDATA[" take-from-insensitive? ] [ unimplemented* ] } [ incorrectly-opened-comment ! bogus-comment-state ] } cond ; : (comment-start-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: - = ] [ drop comment-start-dash-state ] } { [ dup CHAR: > = ] [ drop abrupt-closing-of-empty-comment pick emit-comment-token data-state ] } [ (comment-state) ] } cond ; : comment-start-state ( document n/f string -- document n'/f string ) take-char (comment-start-state) ; : (comment-start-dash-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: - = ] [ drop comment-end-state ] } { [ dup CHAR: > = ] [ drop abrupt-closing-of-empty-comment ] } { [ dup f = ] [ eof-in-comment ] } [ [ CHAR: - reach push-comment-token ] dip (comment-state) ] } cond ; : comment-start-dash-state ( document n/f string -- document n'/f string ) take-char (comment-start-dash-state) ; : (comment-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: < = ] [ reach push-comment-token comment-less-than-sign-state ] } { [ dup CHAR: - = ] [ drop comment-end-dash-state ] } { [ dup CHAR: \0 = ] [ drop unexpected-null-character ] } { [ dup f = ] [ eof-in-comment ] } [ reach push-comment-token comment-state ] } cond ; : comment-state ( document n/f string -- document n'/f string ) take-char (comment-state) ; : (comment-less-than-sign-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: ! = ] [ reach push-comment-token comment-less-than-sign-bang-state ] } { [ dup CHAR: < = ] [ reach push-comment-token comment-less-than-sign-state ] } [ (comment-state) ] } cond ; : comment-less-than-sign-state ( document n/f string -- document n'/f string ) take-char (comment-less-than-sign-state) ; : (comment-less-than-sign-bang-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: - = ] [ reach push-comment-token comment-less-than-sign-bang-dash-state ] } [ (comment-state) ] } cond ; : comment-less-than-sign-bang-state ( document n/f string -- document n'/f string ) take-char (comment-less-than-sign-bang-state) ; : (comment-less-than-sign-bang-dash-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: - = ] [ drop comment-less-than-sign-bang-dash-dash-state ] } [ (comment-end-dash-state) ] } cond ; : comment-less-than-sign-bang-dash-state ( document n/f string -- document n'/f string ) take-char (comment-less-than-sign-bang-dash-state) ; : (comment-less-than-sign-bang-dash-dash-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: > = ] [ (comment-end-state) ] } { [ dup f = ] [ (comment-end-state) ] } [ nested-comment (comment-end-state) ] } cond ; : comment-less-than-sign-bang-dash-dash-state ( document n/f string -- document n'/f string ) take-char (comment-less-than-sign-bang-dash-dash-state) ; : (comment-end-dash-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: - = ] [ drop comment-end-state ] } { [ dup f = ] [ eof-in-comment ] } [ [ CHAR: - reach push-comment-token ] dip (comment-state) ] } cond ; : comment-end-dash-state ( document n/f string -- document n'/f string ) take-char (comment-end-dash-state) ; : (comment-end-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: > = ] [ drop pick emit-comment-token data-state ] } { [ dup CHAR: ! = ] [ drop comment-end-bang-state ] } { [ dup CHAR: - = ] [ reach push-comment-token comment-end-state ] } { [ dup f = ] [ drop eof-in-comment pick [ emit-comment-token ] [ emit-eof ] bi ] } [ [ "--" reach push-all-comment-token ] dip (comment-state) ] } cond ; : comment-end-state ( document n/f string -- document n'/f string ) take-char (comment-end-state) ; : (comment-end-bang-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: - = ] [ drop comment-end-dash-state ] } { [ dup CHAR: > = ] [ drop incorrectly-closed-comment pick emit-comment-token data-state ] } { [ dup f = ] [ eof-in-comment ] } [ [ "--!" reach push-all-comment-token ] dip (comment-state) ] } cond ; : comment-end-bang-state ( document n/f string -- document n'/f string ) take-char (comment-end-bang-state) ; : (doctype-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup "\t\n\f\s" member? ] [ drop before-doctype-name-state ] } { [ dup CHAR: > = ] [ (before-doctype-name-state) ] } { [ dup f = ] [ drop eof-in-doctype pick [ new-doctype-with-quirks ] [ emit-doctype ] [ emit-eof ] tri ] } [ missing-whitespace-before-doctype-name ] } cond ; : doctype-state ( document n/f string -- document n'/f string ) take-char (doctype-state) ; : (before-doctype-name-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup "\t\n\f\s" member? ] [ drop before-doctype-name-state ] } { [ dup ascii-upper-alpha? ] [ 0x20 + reach new-doctype-from-ch doctype-name-state ] } { [ dup CHAR: \0 = ] [ drop unexpected-null-character CHAR: replacement-character reach new-doctype-from-ch doctype-name-state ] } { [ dup CHAR: > = ] [ drop missing-doctype-name pick [ new-doctype-with-quirks ] [ emit-doctype ] bi ] } { [ dup f = ] [ drop eof-in-doctype pick [ new-doctype-with-quirks ] [ emit-doctype ] [ emit-eof ] tri ] } [ reach new-doctype-from-ch doctype-name-state ] } cond ; : before-doctype-name-state ( document n/f string -- document n'/f string ) take-char (before-doctype-name-state) ; : (doctype-name-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup "\t\n\f\s" member? ] [ drop after-doctype-name-state ] } { [ dup CHAR: > = ] [ drop pick emit-doctype data-state ] } { [ dup ascii-upper-alpha? ] [ 0x20 + reach push-doctype-name doctype-name-state ] } { [ dup CHAR: \0 = ] [ drop unexpected-null-character CHAR: replacement-character pick push-doctype-name doctype-name-state ] } { [ dup f = ] [ drop eof-in-doctype pick [ emit-doctype ] [ emit-eof ] bi ] } ! force-quirks on for doctype [ reach push-doctype-name doctype-name-state ] } cond ; : doctype-name-state ( document n/f string -- document n'/f string ) take-char (doctype-name-state) ; : (after-doctype-name-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup "\t\n\f\s" member? ] [ drop after-doctype-name-state ] } { [ dup CHAR: > = ] [ drop pick emit-doctype data-state ] } { [ dup f = ] [ eof-in-doctype ] } { [ [ "PUBLIC" take-from-insensitive? ] dip swap ] [ drop after-doctype-public-keyword-state ] } { [ [ "SYSTEM" take-from-insensitive? ] dip swap ] [ drop after-doctype-system-keyword-state ] } [ invalid-character-sequence-after-doctype-name ] } cond ; : after-doctype-name-state ( document n/f string -- document n'/f string ) take-char (after-doctype-name-state) ; : (after-doctype-public-keyword-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup "\t\n\f\s" member? ] [ drop before-doctype-public-identifier-state ] } { [ dup CHAR: " = ] [ missing-whitespace-after-doctype-public-keyword ] } { [ dup CHAR: ' = ] [ missing-whitespace-after-doctype-public-keyword ] } { [ dup CHAR: > = ] [ drop missing-doctype-public-identifier force-quirks data-state ] } { [ dup f = ] [ drop eof-in-doctype pick [ emit-doctype ] [ emit-eof ] bi ] } [ missing-quote-before-doctype-public-identifier [ reach force-quirks ] dip (bogus-doctype-state) ] } cond ; : after-doctype-public-keyword-state ( document n/f string -- document n'/f string ) take-char (after-doctype-public-keyword-state) ; : (before-doctype-public-identifier-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup "\t\n\f\s" member? ] [ drop before-doctype-public-identifier-state ] } { [ dup CHAR: " = ] [ drop pick initialize-doctype-public-identifier doctype-public-identifier-double-quoted-state ] } { [ dup CHAR: ' = ] [ drop pick initialize-doctype-public-identifier doctype-public-identifier-single-quoted-state ] } { [ dup CHAR: > = ] [ drop missing-doctype-public-identifier pick [ force-quirks ] [ emit-doctype ] bi data-state ] } { [ dup f = ] [ drop eof-in-doctype pick [ emit-doctype ] [ emit-eof ] bi ] } [ missing-quote-before-doctype-public-identifier [ reach force-quirks ] dip (bogus-doctype-state) ] } cond ; : before-doctype-public-identifier-state ( document n/f string -- document n'/f string ) take-char (before-doctype-public-identifier-state) ; : (doctype-public-identifier-double-quoted-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: " = ] [ drop after-doctype-public-identifier-state ] } { [ dup CHAR: \0 = ] [ drop unexpected-null-character CHAR: replacement-character pick push-doctype-public-identifier doctype-public-identifier-double-quoted-state ] } { [ dup CHAR: > = ] [ drop abrupt-doctype-public-identifier pick [ force-quirks ] [ emit-doctype ] bi data-state ] } { [ dup f = ] [ drop eof-in-doctype pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri ] } [ reach push-doctype-public-identifier doctype-public-identifier-double-quoted-state ] } cond ; : doctype-public-identifier-double-quoted-state ( document n/f string -- document n'/f string ) take-char (doctype-public-identifier-double-quoted-state) ; : (doctype-public-identifier-single-quoted-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: ' = ] [ drop after-doctype-public-identifier-state ] } { [ dup CHAR: \0 = ] [ drop unexpected-null-character CHAR: replacement-character pick push-doctype-public-identifier doctype-public-identifier-double-quoted-state ] } { [ dup CHAR: > = ] [ drop abrupt-doctype-public-identifier pick [ force-quirks ] [ emit-doctype ] bi data-state ] } { [ dup f = ] [ drop eof-in-doctype pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri ] } [ reach push-doctype-public-identifier doctype-public-identifier-single-quoted-state ] } cond ; : doctype-public-identifier-single-quoted-state ( document n/f string -- document n'/f string ) take-char (doctype-public-identifier-single-quoted-state) ; : (after-doctype-public-identifier-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup "\t\n\f\s" member? ] [ drop between-doctype-public-and-system-identifiers-state ] } { [ dup CHAR: > = ] [ drop pick emit-doctype data-state ] } { [ dup CHAR: " = ] [ drop missing-whitespace-between-doctype-public-and-system-identifiers pick initialize-doctype-system-identifier doctype-system-identifier-double-quoted-state ] } { [ dup CHAR: ' = ] [ drop missing-whitespace-between-doctype-public-and-system-identifiers pick initialize-doctype-system-identifier doctype-system-identifier-single-quoted-state ] } { [ dup f = ] [ drop eof-in-doctype pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri ] } [ missing-quote-before-doctype-system-identifier [ reach force-quirks ] dip (bogus-doctype-state) ] } cond ; : after-doctype-public-identifier-state ( document n/f string -- document n'/f string ) take-char (after-doctype-public-identifier-state) ; : (between-doctype-public-and-system-identifiers-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup "\t\n\f\s" member? ] [ drop between-doctype-public-and-system-identifiers-state ] } { [ dup CHAR: > = ] [ drop pick emit-doctype data-state ] } { [ dup CHAR: " = ] [ drop pick initialize-doctype-system-identifier doctype-system-identifier-double-quoted-state ] } { [ dup CHAR: ' = ] [ drop pick initialize-doctype-system-identifier doctype-system-identifier-single-quoted-state ] } { [ dup f = ] [ drop eof-in-doctype pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri ] } [ missing-quote-before-doctype-system-identifier [ reach force-quirks ] dip (bogus-doctype-state) ] } cond ; : between-doctype-public-and-system-identifiers-state ( document n/f string -- document n'/f string ) take-char (between-doctype-public-and-system-identifiers-state) ; : (after-doctype-system-keyword-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup "\t\n\f\s" member? ] [ drop between-doctype-public-and-system-identifiers-state ] } { [ dup CHAR: " = ] [ drop missing-whitespace-after-doctype-system-keyword pick initialize-doctype-system-identifier doctype-system-identifier-double-quoted-state ] } { [ dup CHAR: ' = ] [ drop missing-whitespace-after-doctype-system-keyword pick initialize-doctype-system-identifier doctype-system-identifier-single-quoted-state ] } { [ dup CHAR: > = ] [ drop missing-doctype-system-identifier pick [ force-quirks ] [ emit-doctype ] bi data-state ] } { [ dup f = ] [ drop eof-in-doctype pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri ] } [ missing-quote-before-doctype-system-identifier [ reach force-quirks ] dip (bogus-doctype-state) ] } cond ; : after-doctype-system-keyword-state ( document n/f string -- document n'/f string ) take-char (after-doctype-system-keyword-state) ; : (before-doctype-system-identifier-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup "\t\n\f\s" member? ] [ drop before-doctype-system-identifier-state ] } { [ dup CHAR: " = ] [ drop pick initialize-doctype-system-identifier doctype-system-identifier-double-quoted-state ] } { [ dup CHAR: ' = ] [ drop pick initialize-doctype-system-identifier doctype-system-identifier-single-quoted-state ] } { [ dup CHAR: > = ] [ drop missing-doctype-system-identifier pick [ force-quirks ] [ emit-doctype ] bi data-state ] } { [ dup f = ] [ drop eof-in-doctype pick [ emit-doctype ] [ emit-eof ] bi ] } [ missing-quote-before-doctype-system-identifier [ reach force-quirks ] dip (bogus-doctype-state) ] } cond ; : before-doctype-system-identifier-state ( document n/f string -- document n'/f string ) take-char (before-doctype-system-identifier-state) ; : (doctype-system-identifier-double-quoted-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: " = ] [ drop after-doctype-system-identifier-state ] } { [ dup CHAR: \0 = ] [ drop unexpected-null-character CHAR: replacement-character pick push-doctype-system-identifier doctype-system-identifier-double-quoted-state ] } { [ dup CHAR: > = ] [ drop abrupt-doctype-system-identifier pick [ force-quirks ] [ emit-doctype ] bi data-state ] } { [ dup f = ] [ drop eof-in-doctype pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri ] } [ reach push-doctype-system-identifier doctype-system-identifier-double-quoted-state ] } cond ; : doctype-system-identifier-double-quoted-state ( document n/f string -- document n'/f string ) take-char (doctype-system-identifier-double-quoted-state) ; : (doctype-system-identifier-single-quoted-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: ' = ] [ drop after-doctype-system-identifier-state ] } { [ dup CHAR: \0 = ] [ drop unexpected-null-character CHAR: replacement-character pick push-doctype-system-identifier doctype-system-identifier-double-quoted-state ] } { [ dup CHAR: > = ] [ drop abrupt-doctype-system-identifier pick [ force-quirks ] [ emit-doctype ] bi data-state ] } { [ dup f = ] [ drop eof-in-doctype pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri ] } [ reach push-doctype-system-identifier doctype-system-identifier-single-quoted-state ] } cond ; : doctype-system-identifier-single-quoted-state ( document n/f string -- document n'/f string ) take-char (doctype-system-identifier-single-quoted-state) ; : (after-doctype-system-identifier-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup "\t\n\f\s" member? ] [ drop after-doctype-system-identifier-state ] } { [ dup CHAR: > = ] [ drop pick emit-doctype data-state ] } { [ dup f = ] [ drop eof-in-doctype pick [ force-quirks ] [ emit-doctype ] [ emit-eof ] tri ] } [ unexpected-character-after-doctype-system-identifier [ reach force-quirks ] dip (bogus-doctype-state) ] } cond ; : after-doctype-system-identifier-state ( document n/f string -- document n'/f string ) take-char (after-doctype-system-identifier-state) ; : (bogus-doctype-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: > = ] [ drop pick emit-doctype data-state ] } { [ dup CHAR: \0 = ] [ drop unexpected-null-character bogus-doctype-state ] } { [ dup f = ] [ drop eof-in-doctype pick emit-eof ] } [ drop bogus-doctype-state ] } cond ; : bogus-doctype-state ( document n/f string -- document n'/f string ) take-char (bogus-doctype-state) ; : (cdata-section-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: ] = ] [ drop cdata-section-bracket-state ] } { [ dup f = ] [ drop eof-in-cdata pick emit-eof ] } [ reach emit-char cdata-section-state ] } cond ; : cdata-section-state ( document n/f string -- document n'/f string ) take-char (cdata-section-state) ; : (cdata-section-bracket-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: ] = ] [ drop cdata-section-end-state ] } [ [ CHAR: ] reach emit-char ] dip (cdata-section-state) ] } cond ; : cdata-section-bracket-state ( document n/f string -- document n'/f string ) take-char (cdata-section-bracket-state) ; : (cdata-section-end-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup CHAR: ] = ] [ reach emit-char cdata-section-end-state ] } { [ dup CHAR: > = ] [ drop data-state ] } [ [ "]]" reach emit-string ] dip (cdata-section-state) ] } cond ; : cdata-section-end-state ( document n/f string -- document n'/f string ) take-char (cdata-section-end-state) ; : (character-reference-state) ( document n/f string ch/f -- document n'/f string ) [ CHAR: & reach ch>new-temporary-buffer ] dip { { [ dup ascii-alphanumeric? ] [ (named-character-reference-state) ] } { [ dup CHAR: # = ] [ reach push-temporary-buffer numeric-character-reference-state ] } [ reach flush-temporary-buffer (return-state) ] } cond ; : character-reference-state ( document n/f string -- document n'/f string ) take-char (character-reference-state) ; : (named-character-reference-state) ( document n/f string ch/f -- document n'/f string ) reach push-temporary-buffer pick named-character-match? [ drop ! exact match, drop prefix match ! XXX: check me { [ pick temporary-buffer-attribute? ] [ pick temporary-buffer>> ?last CHAR: ; = not ] [ 3dup peek-from { [ CHAR: = = ] [ ascii-alphanumeric? ] } 1|| ] } 0&& [ unimplemented* flush-temporary-buffer return-state ] [ pick [ take-named-character ] [ flush-temporary-buffer ] bi return-state ] if ] [ ! prefix match? [ named-character-reference-state ] [ pick flush-temporary-buffer ambiguous-ampersand-state ] if ] if ; : named-character-reference-state ( document n/f string -- document n'/f string ) take-char (named-character-reference-state) ; : (ambiguous-ampersand-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup ascii-alphanumeric? ] [ unimplemented* ] } { [ dup CHAR: ; = ] [ unknown-named-character-reference (return-state) ] } [ (return-state) ] } cond ; : ambiguous-ampersand-state ( document n/f string -- document n'/f string ) take-char (ambiguous-ampersand-state) ; : (numeric-character-reference-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup "xX" member? ] [ reach push-temporary-buffer hexadecimal-character-reference-start-state ] } [ (decimal-character-reference-start-state) ] } cond ; : numeric-character-reference-state ( document n/f string -- document n'/f string ) take-char (numeric-character-reference-state) ; : (hexadecimal-character-reference-start-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup ascii-hex-digit? ] [ (hexadecimal-character-reference-state) ] } [ absence-of-digits-in-numeric-character-reference reach flush-temporary-buffer (return-state) ] } cond ; : hexadecimal-character-reference-start-state ( document n/f string -- document n'/f string ) take-char (hexadecimal-character-reference-start-state) ; : (decimal-character-reference-start-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup ascii-digit? ] [ (decimal-character-reference-state) ] } [ absence-of-digits-in-numeric-character-reference reach flush-temporary-buffer (return-state) ] } cond ; : decimal-character-reference-start-state ( document n/f string -- document n'/f string ) take-char (decimal-character-reference-start-state) ; : (hexadecimal-character-reference-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup ascii-digit? ] [ unimplemented* ] } { [ dup ascii-upper-hex-digit? ] [ unimplemented* ] } { [ dup ascii-lower-hex-digit? ] [ unimplemented* ] } { [ dup CHAR: ; = ] [ drop numeric-character-reference-end-state ] } [ missing-semicolon-after-character-reference ] } cond ; : hexadecimal-character-reference-state ( document n/f string -- document n'/f string ) take-char (hexadecimal-character-reference-state) ; : (decimal-character-reference-state) ( document n/f string ch/f -- document n'/f string ) { { [ dup ascii-digit? ] [ unimplemented* ] } { [ dup CHAR: ; = ] [ drop numeric-character-reference-end-state ] } [ missing-semicolon-after-character-reference ] } cond ; : decimal-character-reference-state ( document n/f string -- document n'/f string ) take-char (decimal-character-reference-state) ; : (numeric-character-reference-end-state) ( document n/f string ch/f -- document n'/f string ) { [ missing-semicolon-after-character-reference ] } cond ; : numeric-character-reference-end-state ( document n/f string -- document n'/f string ) take-char (numeric-character-reference-end-state) ; : parse-html5 ( string -- document ) [ 0 ] dip data-state 2drop ;