Making XML 30% faster

author Daniel Ehrenberg <littledan@Macintosh-103.local>

Fri, 30 Jan 2009 03:41:08 +0000 (21:41 -0600)

committer Daniel Ehrenberg <littledan@Macintosh-103.local>

Fri, 30 Jan 2009 03:41:08 +0000 (21:41 -0600)
author Daniel Ehrenberg <littledan@Macintosh-103.local>
Fri, 30 Jan 2009 03:41:08 +0000 (21:41 -0600)
committer Daniel Ehrenberg <littledan@Macintosh-103.local>
Fri, 30 Jan 2009 03:41:08 +0000 (21:41 -0600)
diff --git a/basis/xml/char-classes/char-classes.factor b/basis/xml/char-classes/char-classes.factor

index b47d4c66df92e9fe1bd446fccd1940a01c123113..d510c8a881d47e8d9538db82b0653b0d1b7b3be3 100644 (file)
--- a/basis/xml/char-classes/char-classes.factor
+++ b/basis/xml/char-classes/char-classes.factor
@@ -1,6 +1,7 @@
-! Copyright (C) 2005, 2007 Daniel Ehrenberg
+! Copyright (C) 2005, 2009 Daniel Ehrenberg
  ! See http://factorcode.org/license.txt for BSD license.
-USING: kernel sequences unicode.syntax math math.order combinators ;
+USING: kernel sequences unicode.syntax math math.order combinators
+hints ;
  IN: xml.char-classes
  
  CATEGORY: 1.0name-start* Ll Lu Lo Lt Nl \u000559\u0006E5\u0006E6_: ;
@@ -31,3 +32,5 @@ CATEGORY: 1.1name-char Ll Lu Lo Lm Ln Nl Mc Mn Nd Pc Cf _-.\u0000b7: ;
          { [ dup HEX: E000 < ] [ drop f ] }
          [ { HEX: FFFE HEX: FFFF } member? not ]
      } cond ;
+
+HINTS: text? { object fixnum } ;
diff --git a/basis/xml/errors/errors-tests.factor b/basis/xml/errors/errors-tests.factor

index 4204979941738a0462f18245479348e93943942b..8a469bc08fbe0d3f598b822e4b983d37d6ef6271 100644 (file)
--- a/basis/xml/errors/errors-tests.factor
+++ b/basis/xml/errors/errors-tests.factor
@@ -6,11 +6,11 @@ IN: xml.errors.tests
      '[ _ string>xml ] swap '[ _ = ] must-fail-with ;
  
  T{ no-entity f 1 10 "nbsp" } "<x>&nbsp;</x>" xml-error-test
-T{ mismatched f 1 8 T{ name f "" "x" "" } T{ name f "" "y" "" } }
+T{ mismatched f 1 7 T{ name f "" "x" "" } T{ name f "" "y" "" } }
      "<x></y>" xml-error-test
-T{ unclosed f 1 4 V{ T{ name f "" "x" "" } } } "<x>" xml-error-test
+T{ unclosed f 1 3 V{ T{ name f "" "x" "" } } } "<x>" xml-error-test
  T{ nonexist-ns f 1 5 "x" } "<x:y/>" xml-error-test
-T{ unopened f 1 5 } "</x>" xml-error-test
+T{ unopened f 1 4 } "</x>" xml-error-test
  T{ not-yes/no f 1 41 "maybe" }
      "<?xml version='1.0' standalone='maybe'?><x/>" xml-error-test
  T{ extra-attrs f 1 32 V{ T{ name f "" "foo" f } }
@@ -19,13 +19,13 @@ T{ bad-version f 1 28 "5 million" }
      "<?xml version='5 million'?><x/>" xml-error-test
  T{ notags f } "" xml-error-test
  T{ multitags } "<x/><y/>" xml-error-test
-T{ bad-prolog  f 1 26 T{ prolog f "1.0" "UTF-8" f } }
+T{ bad-prolog  f 1 25 T{ prolog f "1.0" "UTF-8" f } }
      "<x/><?xml version='1.0'?>" xml-error-test
  T{ capitalized-prolog f 1 6 "XmL" } "<?XmL version='1.0'?><x/>"
      xml-error-test
  T{ pre/post-content f "x" t } "x<y/>" xml-error-test
  T{ versionless-prolog f 1 8 } "<?xml?><x/>" xml-error-test
-T{ unclosed-quote f 1 13 } "<x value='/>" xml-error-test
+T{ unclosed-quote f 1 12 } "<x value='/>" xml-error-test
  T{ bad-name f 1 3 "-" } "<-/>" xml-error-test
  T{ quoteless-attr f 1 12 } "<x value=<->/>" xml-error-test
  T{ quoteless-attr f 1 10 } "<x value=3/>" xml-error-test
@@ -37,6 +37,6 @@ T{ bad-cdata f 1 7 } "<x/><![CDATA[]]>" xml-error-test
  T{ pre/post-content f "&" t } "&#32;<x/>" xml-error-test
  T{ bad-doctype f 1 17 "a" } "<!DOCTYPE foo [ a ]><x/>" xml-error-test
  T{ bad-doctype f 1 22 T{ opener { name T{ name f "" "foo" "" } } { attrs T{ attrs } } } } "<!DOCTYPE foo [ <foo> ]><x/>" xml-error-test
-T{ disallowed-char f 1 3 1 } "<x>\u000001</x>" xml-error-test
-T{ missing-close f 1 9 } "<!-- foo" xml-error-test
+T{ disallowed-char f 1 4 1 } "<x>\u000001</x>" xml-error-test
+T{ missing-close f 1 8 } "<!-- foo" xml-error-test
  T{ misplaced-directive f 1 9 "ENTITY" } "<!ENTITY foo 'bar'><x/>" xml-error-test
diff --git a/basis/xml/state/state.factor b/basis/xml/state/state.factor

index eba94220e396210fcd477ffcf78bff41eb17e9ad..a31565c8ad0b2fb0e4d6618905c048ce0da95a75 100644 (file)
--- a/basis/xml/state/state.factor
+++ b/basis/xml/state/state.factor
@@ -1,9 +1,11 @@
  ! Copyright (C) 2005, 2009 Daniel Ehrenberg
  ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors kernel namespaces io ;
+USING: accessors kernel namespaces io math ;
  IN: xml.state
  
-TUPLE: spot char line column next check version-1.0? ;
+TUPLE: spot
+    char { line fixnum } { column fixnum }
+    next check version-1.0? ;
  
  C: <spot> spot
  
diff --git a/basis/xml/tests/state-parser-tests.factor b/basis/xml/tests/state-parser-tests.factor

index 24c3bc4b690269c2e53a7fbff62356f73e661b23..7616efaf1d813cc86557cef87f82e8b351ec4d22 100644 (file)
--- a/basis/xml/tests/state-parser-tests.factor
+++ b/basis/xml/tests/state-parser-tests.factor
@@ -11,7 +11,7 @@ IN: xml.test.state
      1string take-to ;
  
  [ "hello" ] [ "hello" [ take-rest ] string-parse ] unit-test
-[ 2 4 ] [ "12\n123" [ take-rest drop get-line get-column ] string-parse ] unit-test
+[ 2 3 ] [ "12\n123" [ take-rest drop get-line get-column ] string-parse ] unit-test
  [ "hi" " how are you?" ] [ "hi how are you?" [ [ get-char blank? ] take-until take-rest ] string-parse ] unit-test
  [ "foo" ";bar" ] [ "foo;bar" [ CHAR: ; take-char take-rest ] string-parse ] unit-test
  [ "foo " " bar" ] [ "foo and bar" [ "and" take-string take-rest ] string-parse ] unit-test
diff --git a/basis/xml/tokenize/tokenize.factor b/basis/xml/tokenize/tokenize.factor

index 50ab43ca7b18b7020d55f19f25d85508d5e49a48..c38a313b80fca4b2e147bd9a2fe5f49bbec2ca91 100644 (file)
--- a/basis/xml/tokenize/tokenize.factor
+++ b/basis/xml/tokenize/tokenize.factor
@@ -3,111 +3,131 @@
  USING: namespaces xml.state kernel sequences accessors
  xml.char-classes xml.errors math io sbufs fry strings ascii
  circular xml.entities assocs make splitting math.parser
-locals combinators arrays ;
+locals combinators arrays hints ;
  IN: xml.tokenize
  
-: assure-good-char ( ch -- ch )
+! * Basic utility words
+
+: assure-good-char ( spot ch -- )
      [
-        version-1.0? over text? not get-check and
-        [ disallowed-char ] when
-    ] [ f ] if* ;
+        swap
+        [ version-1.0?>> over text? not ]
+        [ check>> ] bi and [
+            spot get [ 1+ ] change-column drop
+            disallowed-char
+        ] [ drop ] if
+    ] [ drop ] if* ;
+
+HINTS: assure-good-char { spot fixnum } ;
+
+: record ( spot char -- spot )
+    over char>> [
+        CHAR: \n =
+        [ [ 1+ ] change-line -1 ] [ dup column>> 1+ ] if
+        >>column
+    ] [ drop ] if ;
  
-! * Basic utility words
+HINTS: record { spot fixnum } ;
  
-: record ( char -- )
-    CHAR: \n =
-    [ 0 get-line 1+ set-line ] [ get-column 1+ ] if
-    set-column ;
+:: (next) ( spot -- spot char )
+    spot next>> :> old-next
+    read1 :> new-next
+    old-next CHAR: \r = [
+        spot CHAR: \n >>char
+        new-next CHAR: \n =
+        [ read1 >>next ]
+        [ new-next >>next ] if
+    ] [ spot old-next >>char new-next >>next ] if
+    spot next>> ; inline
  
-! (next) normalizes \r\n and \r
-: (next) ( -- char )
-    get-next read1
-    2dup swap CHAR: \r = [
-        CHAR: \n =
-        [ nip read1 ] [ nip CHAR: \n swap ] if
-    ] [ drop ] if
-    set-next dup set-char assure-good-char ;
+: next* ( spot -- )
+    dup char>> [ unexpected-end ] unless
+    (next) [ record ] keep assure-good-char ;
+
+HINTS: next* { spot } ;
  
  : next ( -- )
-    #! Increment spot.
-    get-char [ unexpected-end ] unless (next) record ;
+    spot get next* ;
  
  : init-parser ( -- )
-    0 1 0 f f t <spot> spot set
+    0 1 0 0 f t <spot> spot set
      read1 set-next next ;
  
  : with-state ( stream quot -- )
      ! with-input-stream implicitly creates a new scope which we use
      swap [ init-parser call ] with-input-stream ; inline
  
+:: (skip-until) ( quot: ( -- ? ) spot -- )
+    spot char>> [
+        quot call [
+            spot next* quot spot (skip-until)
+        ] unless
+    ] when ; inline recursive
+
  : skip-until ( quot: ( -- ? ) -- )
-    get-char [
-        [ call ] keep swap [ drop ] [
-            next skip-until
-        ] if
-    ] [ drop ] if ; inline recursive
+    spot get (skip-until) ; inline
  
  : take-until ( quot -- string )
      #! Take the substring of a string starting at spot
      #! from code until the quotation given is true and
      #! advance spot to after the substring.
      10 <sbuf> [
-        '[ @ [ t ] [ get-char _ push f ] if ] skip-until
+        spot get swap
+        '[ @ [ t ] [ _ char>> _ push f ] if ] skip-until
      ] keep >string ; inline
  
  : take-to ( seq -- string )
-    '[ get-char _ member? ] take-until ;
+    spot get swap '[ _ char>> _ member? ] take-until ;
  
  : pass-blank ( -- )
      #! Advance code past any whitespace, including newlines
-    [ get-char blank? not ] skip-until ;
+    spot get '[ _ char>> blank? not ] skip-until ;
  
-: string-matches? ( string circular -- ? )
-    get-char over push-circular
-    sequence= ;
+: string-matches? ( string circular spot -- ? )
+    char>> over push-circular sequence= ;
  
  : take-string ( match -- string )
      dup length <circular-string>
-    [ 2dup string-matches? ] take-until nip
+    spot get '[ 2dup _ string-matches? ] take-until nip
      dup length rot length 1- - head
      get-char [ missing-close ] unless next ;
  
  : expect ( string -- )
-    dup [ get-char next ] replicate 2dup =
-    [ 2drop ] [ expected ] if ;
+    dup spot get '[ _ [ char>> ] keep next* ] replicate
+    2dup = [ 2drop ] [ expected ] if ;
  
  ! Suddenly XML-specific
  
-: parse-named-entity ( string -- )
-    dup entities at [ , ] [
+: parse-named-entity ( accum string -- )
+    dup entities at [ swap push ] [
          dup extra-entities get at
-        [ % ] [ no-entity ] ?if
+        [ swap push-all ] [ no-entity ] ?if
      ] ?if ;
  
  : take-; ( -- string )
      next ";" take-to next ;
  
-: parse-entity ( -- )
+: parse-entity ( accum -- )
      take-; "#" ?head [
-        "x" ?head 16 10 ? base> ,
+        "x" ?head 16 10 ? base> swap push
      ] [ parse-named-entity ] if ;
  
-: parse-pe ( -- )
+: parse-pe ( accum -- )
      take-; dup pe-table get at
-    [ % ] [ no-entity ] ?if ;
+    [ swap push-all ] [ no-entity ] ?if ;
  
-:: (parse-char) ( quot: ( ch -- ? ) -- )
+:: (parse-char) ( quot: ( ch -- ? ) accum -- )
      get-char :> char
      {
          { [ char not ] [ ] }
          { [ char quot call ] [ next ] }
-        { [ char CHAR: & = ] [ parse-entity quot (parse-char) ] }
-        { [ in-dtd? get char CHAR: % = and ] [ parse-pe quot (parse-char) ] }
-        [ char , next quot (parse-char) ]
+        { [ char CHAR: & = ] [ accum parse-entity quot accum (parse-char) ] }
+        { [ in-dtd? get char CHAR: % = and ] [ accum parse-pe quot accum (parse-char) ] }
+        [ char accum push next quot accum (parse-char) ]
      } cond ; inline recursive
  
  : parse-char ( quot: ( ch -- ? ) -- seq )
-    [ (parse-char) ] "" make ; inline
+    1024 <sbuf> [ (parse-char) ] keep >string ; inline
  
  : assure-no-]]> ( circular -- )
      "]]>" sequence= [ text-w/]]> ] when ;
author	Daniel Ehrenberg <littledan@Macintosh-103.local>
	Fri, 30 Jan 2009 03:41:08 +0000 (21:41 -0600)
committer	Daniel Ehrenberg <littledan@Macintosh-103.local>
	Fri, 30 Jan 2009 03:41:08 +0000 (21:41 -0600)
basis/xml/char-classes/char-classes.factor		patch \| blob \| history
basis/xml/errors/errors-tests.factor		patch \| blob \| history
basis/xml/state/state.factor		patch \| blob \| history
basis/xml/tests/state-parser-tests.factor		patch \| blob \| history
basis/xml/tokenize/tokenize.factor		patch \| blob \| history