: 10xxxxxx? ( ch -- ? )
-6 shift 3 bitand 2 = ;
-
+
: start<name ( ch -- tag )
! This is unfortunate, and exists for the corner case
! that the first letter of the document is < and second is
ascii?
[ utf8 decode-stream next make-tag ] [
next
- [ get-next 10xxxxxx? not ] take-until
+ [ drop get-next 10xxxxxx? not ] take-until
get-char suffix utf8 decode
utf8 decode-stream next
continue-make-tag
: name-char? ( 1.0? char -- ? )
swap [ 1.0name-char? ] [ 1.1name-char? ] if ;
-: text? ( 1.0? char -- ? )
+HINTS: name-start? { object fixnum } ;
+HINTS: name-char? { object fixnum } ;
+
+<PRIVATE
+
+: 1.0-text? ( char -- ? )
! 1.0:
! #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
+ {
+ [ HEX: 20 HEX: D7FF between? ]
+ [ "\t\r\n" member? ]
+ [ HEX: E000 HEX: FFFD between? ]
+ [ HEX: 10000 HEX: 10FFFFF between? ]
+ } 1|| ; inline
+
+: 1.1-text? ( char -- ? )
! 1.1:
! [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
{
- { [ dup HEX: 20 < ] [ swap [ "\t\r\n" member? ] [ zero? not ] if ] }
- { [ nip dup HEX: D800 < ] [ drop t ] }
- { [ dup HEX: E000 < ] [ drop f ] }
- [ { HEX: FFFE HEX: FFFF } member? not ]
- } cond ;
+ [ HEX: 1 HEX: D7FF between? ]
+ [ HEX: E000 HEX: FFFD between? ]
+ [ HEX: 10000 HEX: 10FFFF between? ]
+ } 1|| ; inline
+
+PRIVATE>
+
+: text? ( 1.0? char -- ? )
+ swap [ 1.0-text? ] [ 1.1-text? ] if ;
HINTS: text? { object fixnum } ;
: take-interpolated ( quot -- interpolated )
interpolating? get [
- drop get-char CHAR: > =
+ drop get-char CHAR: > eq?
[ next f ]
[ "->" take-string [ blank? ] trim ]
if <interpolated>
: parse-attr ( -- )
parse-name pass-blank "=" expect pass-blank
- get-char CHAR: < =
+ get-char CHAR: < eq?
[ "<-" expect interpolate-quote ]
[ t parse-quote* ] if 2array , ;
: start-tag ( -- name ? )
#! Outputs the name and whether this is a closing tag
- get-char CHAR: / = dup [ next ] when
+ get-char CHAR: / eq? dup [ next ] when
parse-name swap ;
: (middle-tag) ( -- )
: middle-tag ( -- attrs-alist )
! f make will make a vector if it has any elements
[ (middle-tag) ] f make pass-blank
- assure-no-duplicates ;
+ dup length 1 > [ assure-no-duplicates ] when ;
: end-tag ( name attrs-alist -- tag )
- tag-ns pass-blank get-char CHAR: / =
+ tag-ns pass-blank get-char CHAR: / eq?
[ pop-ns <contained> next ">" expect ]
[ depth inc <opener> close ] if ;
[ take-external-id ] [ f ] if ;
: take-internal ( -- dtd/f )
- get-char CHAR: [ =
+ get-char CHAR: [ eq?
[ next take-internal-subset ] [ f ] if ;
: take-doctype-decl ( -- doctype-decl )
[ "-" bad-name ] take-interpolated ;
: make-tag ( -- tag )
- {
- { [ get-char dup CHAR: ! = ] [ drop next direct ] }
- { [ dup CHAR: ? = ] [ drop next instruct ] }
- { [ dup CHAR: - = ] [ drop next interpolate-tag ] }
+ get-char {
+ { CHAR: ! [ next direct ] }
+ { CHAR: ? [ next instruct ] }
+ { CHAR: - [ next interpolate-tag ] }
[ drop normal-tag ]
- } cond ;
+ } case ;
] ?if ;
: take-name ( -- string )
- version-1.0? '[ _ get-char name-char? not ] take-until ;
+ version-1.0? '[ _ swap name-char? not ] take-until ;
: parse-name ( -- name )
take-name interpret-name ;
} case ;
: take-word ( -- string )
- [ get-char blank? ] take-until ;
+ [ blank? ] take-until ;
: take-external-id ( -- external-id )
take-word (take-external-id) ;
USING: accessors kernel namespaces io math ;
IN: xml.state
-TUPLE: spot
- char line column next check version-1.0? stream ;
+TUPLE: spot char line column next check version-1.0? stream ;
C: <spot> spot
-: get-char ( -- char ) spot get char>> ;
-: set-char ( char -- ) spot get swap >>char drop ;
-: get-line ( -- line ) spot get line>> ;
-: set-line ( line -- ) spot get swap >>line drop ;
-: get-column ( -- column ) spot get column>> ;
-: set-column ( column -- ) spot get swap >>column drop ;
-: get-next ( -- char ) spot get next>> ;
-: set-next ( char -- ) spot get swap >>next drop ;
-: get-check ( -- ? ) spot get check>> ;
-: check ( -- ) spot get t >>check drop ;
-: version-1.0? ( -- ? ) spot get version-1.0?>> ;
+: get-char ( -- char ) spot get char>> ; inline
+: get-line ( -- line ) spot get line>> ; inline
+: get-column ( -- column ) spot get column>> ; inline
+: get-next ( -- char ) spot get next>> ; inline
+: get-check ( -- ? ) spot get check>> ; inline
+: check ( -- ) spot get t >>check drop ; inline
+: version-1.0? ( -- ? ) spot get version-1.0?>> ; inline
: set-version ( string -- )
- spot get swap "1.0" = >>version-1.0? drop ;
+ spot get swap "1.0" = >>version-1.0? drop ; inline
SYMBOL: xml-stack
-
SYMBOL: depth
-
SYMBOL: interpolating?
-
SYMBOL: in-dtd?
-
SYMBOL: pe-table
-
SYMBOL: extra-entities
[ <string-reader> ] dip with-state ; inline
: take-rest ( -- string )
- [ f ] take-until ;
+ [ drop f ] take-until ;
: take-char ( char -- string )
1string take-to ;
[ "hello" ] [ "hello" [ take-rest ] string-parse ] unit-test
[ 2 3 ] [ "12\n123" [ take-rest drop get-line get-column ] string-parse ] unit-test
-[ "hi" " how are you?" ] [ "hi how are you?" [ [ get-char blank? ] take-until take-rest ] string-parse ] unit-test
+[ "hi" " how are you?" ] [ "hi how are you?" [ [ blank? ] take-until take-rest ] string-parse ] unit-test
[ "foo" ";bar" ] [ "foo;bar" [ CHAR: ; take-char take-rest ] string-parse ] unit-test
[ "foo " " bar" ] [ "foo and bar" [ "and" take-string take-rest ] string-parse ] unit-test
[ "baz" ] [ " \n\t baz" [ pass-blank take-rest ] string-parse ] unit-test
! See http://factorcode.org/license.txt for BSD license.
USING: namespaces xml.state kernel sequences accessors
xml.char-classes xml.errors math io sbufs fry strings ascii
-circular xml.entities assocs splitting math.parser
+xml.entities assocs splitting math.parser
locals combinators arrays hints ;
IN: xml.tokenize
: assure-good-char ( spot ch -- )
[
- swap
+ over
[ version-1.0?>> over text? not ]
- [ check>> ] bi and [
- spot get [ 1 + ] change-column drop
+ [ check>> ] bi and
+ [
+ [ [ 1 + ] change-column drop ] dip
disallowed-char
- ] [ drop ] if
+ ] [ 2drop ] if
] [ drop ] if* ;
HINTS: assure-good-char { spot fixnum } ;
: record ( spot char -- spot )
over char>> [
- CHAR: \n =
+ CHAR: \n eq?
[ [ 1 + ] change-line -1 ] [ dup column>> 1 + ] if
>>column
] [ drop ] if ;
:: (next) ( spot -- spot char )
spot next>> :> old-next
spot stream>> stream-read1 :> new-next
- old-next CHAR: \r = [
+ old-next CHAR: \r eq? [
spot CHAR: \n >>char
- new-next CHAR: \n =
+ new-next CHAR: \n eq?
[ spot stream>> stream-read1 >>next ]
[ new-next >>next ] if
] [ spot old-next >>char new-next >>next ] if
: init-parser ( -- )
0 1 0 0 f t f <spot>
input-stream get >>stream
- spot set
- read1 set-next next ;
+ read1 >>next
+ spot set next ;
: with-state ( stream quot -- )
! with-input-stream implicitly creates a new scope which we use
swap [ init-parser call ] with-input-stream ; inline
-:: (skip-until) ( ... quot: ( ... -- ... ? ) spot -- ... )
+:: (skip-until) ( ... quot: ( ... char -- ... ? ) spot -- ... )
spot char>> [
quot call [
spot next* quot spot (skip-until)
] unless
- ] when ; inline recursive
+ ] when* ; inline recursive
-: skip-until ( ... quot: ( ... -- ... ? ) -- ... )
+: skip-until ( ... quot: ( ... char -- ... ? ) -- ... )
spot get (skip-until) ; inline
-: take-until ( quot -- string )
+: take-until ( ... quot: ( ... char -- ... ? ) -- ... string )
#! Take the substring of a string starting at spot
#! from code until the quotation given is true and
#! advance spot to after the substring.
- 10 <sbuf> [
- spot get swap
- '[ @ [ t ] [ _ char>> _ push f ] if ] skip-until
- ] keep >string ; inline
+ 10 <sbuf> [
+ '[ _ keep over [ drop ] [ _ push ] if ] skip-until
+ ] keep >string ; inline
: take-to ( seq -- string )
- spot get swap '[ _ char>> _ member? ] take-until ;
+ '[ _ member? ] take-until ;
: pass-blank ( -- )
#! Advance code past any whitespace, including newlines
- spot get '[ _ char>> blank? not ] skip-until ;
+ [ blank? not ] skip-until ;
-: string-matches? ( string circular spot -- ? )
- char>> over circular-push sequence= ;
+: string-matcher ( str -- quot: ( pos char -- pos ? ) )
+ dup length 1 - '[
+ over _ nth eq? [ 1 + ] [ drop 0 ] if dup _ >
+ ] ; inline
: take-string ( match -- string )
- dup length <circular-string>
- spot get '[ 2dup _ string-matches? ] take-until nip
- dup length rot length 1 - - head
+ [ 0 swap string-matcher take-until nip ] keep
+ dupd [ length ] bi@ 1 - - head
get-char [ missing-close ] unless next ;
: expect ( string -- )
{
{ [ char not ] [ ] }
{ [ char quot call ] [ spot next* ] }
- { [ char CHAR: & = ] [
+ { [ char CHAR: & eq? ] [
accum parse-entity
quot accum spot (parse-char)
] }
- { [ in-dtd? get char CHAR: % = and ] [
+ { [ char CHAR: % eq? in-dtd? get and ] [
accum parse-pe
quot accum spot (parse-char)
] }
: parse-char ( quot: ( ch -- ? ) -- seq )
1024 <sbuf> [ spot get (parse-char) ] keep >string ; inline
-: assure-no-]]> ( circular -- )
- "]]>" sequence= [ text-w/]]> ] when ;
+: assure-no-]]> ( pos char -- pos' )
+ over "]]>" nth eq? [ 1 + ] [ drop 0 ] if
+ dup 2 > [ text-w/]]> ] when ;
:: parse-text ( -- string )
- 3 f <array> <circular> :> circ
- depth get zero? :> no-text [| char |
- char circ circular-push
- circ assure-no-]]>
- no-text [ char blank? char CHAR: < = or [
- char 1string t pre/post-content
- ] unless ] when
- char CHAR: < =
+ 0 :> pos!
+ depth get zero? :> no-text
+ [| char |
+ pos char assure-no-]]> pos!
+ no-text [
+ char blank? char CHAR: < eq? or [
+ char 1string t pre/post-content
+ ] unless
+ ] when
+ char CHAR: < eq?
] parse-char ;
: close ( -- )
: (parse-quote) ( <-disallowed? ch -- string )
swap '[
- dup _ = [ drop t ]
- [ CHAR: < = _ and [ attr-w/< ] [ f ] if ] if
+ dup _ eq? [ drop t ]
+ [ CHAR: < eq? _ and [ attr-w/< ] [ f ] if ] if
] parse-char normalize-quote get-char
[ unclosed-quote ] unless ; inline
! Copyright (C) 2005, 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays io io.encodings.binary io.files
-io.streams.string kernel namespaces sequences strings
+io.streams.string kernel math namespaces sequences strings
io.encodings.utf8 xml.data xml.errors xml.elements ascii
xml.entities xml.state xml.autoencoding assocs xml.tokenize
combinators.short-circuit xml.name splitting
swap [ call ] keep ; inline
: xml-loop ( quot: ( xml-elem -- ) -- )
- parse-text call-under
- get-char [ make-tag call-under xml-loop ]
+ parse-text call-under get-char
+ [ make-tag call-under xml-loop ]
[ drop ] if ; inline recursive
: read-seq ( stream quot n -- seq )