! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays hashtables html.parser.state html.parser.utils kernel make namespaces sequences unicode.case unicode.categories combinators.short-circuit quoting ; IN: html.parser TUPLE: tag name attributes text closing? ; SINGLETON: text SINGLETON: dtd SINGLETON: comment SYMBOL: tagstack : push-tag ( tag -- ) tagstack get push ; : closing-tag? ( string -- ? ) [ f ] [ [ first ] [ peek ] bi [ CHAR: / = ] bi@ or ] if-empty ; : ( name attributes closing? -- tag ) tag new swap >>closing? swap >>attributes swap >>name ; : make-tag ( string attribs -- tag ) [ [ closing-tag? ] keep "/" trim1 ] dip rot ; : new-tag ( string type -- tag ) tag new swap >>name swap >>text ; inline : make-text-tag ( string -- tag ) text new-tag ; inline : make-comment-tag ( string -- tag ) comment new-tag ; inline : make-dtd-tag ( string -- tag ) dtd new-tag ; inline : read-single-quote ( state-parser -- string ) [ [ CHAR: ' = ] take-until ] [ next drop ] bi ; : read-double-quote ( state-parser -- string ) [ [ CHAR: " = ] take-until ] [ next drop ] bi ; : read-quote ( state-parser -- string ) dup get+increment CHAR: ' = [ read-single-quote ] [ read-double-quote ] if ; : read-key ( state-parser -- string ) skip-whitespace [ { [ CHAR: = = ] [ blank? ] } 1|| ] take-until ; : read-= ( state-parser -- ) skip-whitespace [ [ CHAR: = = ] take-until drop ] [ next drop ] bi ; : read-token ( state-parser -- string ) [ blank? ] take-until ; : read-value ( state-parser -- string ) skip-whitespace dup get-char quote? [ read-quote ] [ read-token ] if [ blank? ] trim ; : read-comment ( state-parser -- ) "-->" take-until-sequence make-comment-tag push-tag ; : read-dtd ( state-parser -- ) ">" take-until-sequence make-dtd-tag push-tag ; : read-bang ( state-parser -- ) next dup { [ get-char CHAR: - = ] [ get-next CHAR: - = ] } 1&& [ next next read-comment ] [ read-dtd ] if ; : read-tag ( state-parser -- string ) [ [ "><" member? ] take-until ] [ dup get-char CHAR: < = [ next ] unless drop ] bi ; : read-until-< ( state-parser -- string ) [ CHAR: < = ] take-until ; : parse-text ( state-parser -- ) read-until-< [ make-text-tag push-tag ] unless-empty ; : (parse-attributes) ( state-parser -- ) skip-whitespace dup state-parse-end? [ drop ] [ [ [ read-key >lower ] [ read-= ] [ read-value ] tri 2array , ] keep (parse-attributes) ] if ; : parse-attributes ( state-parser -- hashtable ) [ (parse-attributes) ] { } make >hashtable ; : (parse-tag) ( string -- string' hashtable ) [ [ read-token >lower ] [ parse-attributes ] bi ] state-parse ; : read-< ( state-parser -- string/f ) next dup get-char [ CHAR: ! = [ read-bang f ] [ read-tag ] if ] [ drop f ] if* ; : parse-tag ( state-parser -- ) read-< [ (parse-tag) make-tag push-tag ] unless-empty ; : (parse-html) ( state-parser -- ) dup get-next [ [ parse-text ] [ parse-tag ] [ (parse-html) ] tri ] [ drop ] if ; : tag-parse ( quot -- vector ) V{ } clone tagstack [ state-parse ] with-variable ; inline : parse-html ( string -- vector ) [ (parse-html) tagstack get ] tag-parse ;