-USING: xml xml.data xml.utilities tools.test ;
+USING: xml xml.data xml.utilities tools.test accessors kernel ;
[ "\u000131" ] [ "resource:basis/xml/tests/latin5.xml" file>xml children>string ] unit-test
[ "\u0000e9" ] [ "resource:basis/xml/tests/latin1.xml" file>xml children>string ] unit-test
+[ "\u0000e9" ] [ "resource:basis/xml/tests/spaces.xml" file>xml children>string ] unit-test
[ "\u0000e9" ] [ "resource:basis/xml/tests/utf8.xml" file>xml children>string ] unit-test
[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16.xml" file>xml children>string ] unit-test
[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16be.xml" file>xml children>string ] unit-test
[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16le-bom.xml" file>xml children>string ] unit-test
[ "\u0000e9" ] [ "resource:basis/xml/tests/prologless.xml" file>xml children>string ] unit-test
[ "e" ] [ "resource:basis/xml/tests/ascii.xml" file>xml children>string ] unit-test
+[ "\u0000e9" "x" ] [ "resource:basis/xml/tests/unitag.xml" file>xml [ name>> main>> ] [ children>string ] bi ] unit-test
\ No newline at end of file
! Copyright (C) 2005, 2006 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
-USING: accessors arrays ascii assocs combinators fry io.encodings
-io.encodings.iana io.encodings.utf16 io.encodings.utf8 kernel
-make math.parser namespaces sequences sets splitting state-parser
-xml.char-classes xml.data xml.entities xml.errors strings ;
+USING: accessors arrays ascii assocs combinators
+combinators.short-circuit fry io.encodings io.encodings.iana
+io.encodings.string io.encodings.utf16 io.encodings.utf8 kernel make
+math math.parser namespaces sequences sets splitting state-parser
+strings xml.char-classes xml.data xml.entities xml.errors ;
IN: xml.tokenize
! XML namespace processing: ns = namespace
! version=1.0? is calculated once and passed around for efficiency
-: (parse-name) ( -- str )
- version=1.0? dup
- get-char name-start? [
- [ dup get-char name-char? not ] take-until nip
- ] [
- "Malformed name" xml-string-error
- ] if ;
+: assure-name ( str version=1.0? -- str )
+ over {
+ [ first name-start? ]
+ [ rest-slice [ name-char? ] with all? ]
+ } 2&& [ "Malformed name" xml-string-error ] unless ;
-: parse-name ( -- name )
+: (parse-name) ( start -- str )
+ version=1.0?
+ [ [ get-char name-char? not ] curry take-until append ]
+ [ assure-name ] bi ;
+
+: parse-name-starting ( start -- name )
(parse-name) get-char CHAR: : =
- [ next (parse-name) ] [ "" swap ] if f <name> ;
+ [ next "" (parse-name) ] [ "" swap ] if f <name> ;
+
+: parse-name ( -- name )
+ "" parse-name-starting ;
! -- Parsing strings
: parse-text ( -- string )
CHAR: < parse-char ;
-
+
! Parsing tags
: start-tag ( -- name ? )
dup prolog-data set ;
: instruct ( -- instruction )
- (parse-name) dup "xml" =
+ "" (parse-name) dup "xml" =
[ drop parse-prolog ] [
dup >lower "xml" =
[ capitalized-prolog ]
! Autodetecting encodings
+: continue-make-tag ( str -- tag )
+ parse-name-starting middle-tag end-tag CHAR: > expect ;
+
: start-utf16le ( -- tag )
utf16le decode-input-if
CHAR: ? expect
0 expect instruct ;
+: 10xxxxxx? ( ch -- ? )
+ -6 shift 3 bitand 2 = ;
+
+: start<name ( ch -- tag )
+ ascii?
+ [ utf8 decode-input-if next make-tag ] [
+ next
+ [ get-next 10xxxxxx? not ] take-until
+ get-char suffix utf8 decode
+ utf8 decode-input-if next
+ continue-make-tag
+ ] if ;
+
: start< ( -- tag )
get-next {
{ 0 [ next next start-utf16le ] }
{ CHAR: ? [ next next instruct ] } ! XML prolog parsing sets the encoding
- [ drop utf8 decode-input-if next make-tag ]
- ! That is a hack. It fails if you have <nonascii
+ { CHAR: ! [ utf8 decode-input next next direct ] }
+ [ start<name ]
} case ;
: skip-utf8-bom ( -- tag )