Initial optimization of encodings

author Daniel Ehrenberg <ehrenbed@carleton.edu>

Tue, 18 Mar 2008 21:01:14 +0000 (17:01 -0400)

committer Daniel Ehrenberg <ehrenbed@carleton.edu>

Tue, 18 Mar 2008 21:01:14 +0000 (17:01 -0400)
author Daniel Ehrenberg <ehrenbed@carleton.edu>
Tue, 18 Mar 2008 21:01:14 +0000 (17:01 -0400)
committer Daniel Ehrenberg <ehrenbed@carleton.edu>
Tue, 18 Mar 2008 21:01:14 +0000 (17:01 -0400)
diff --git a/core/io/encodings/encodings-docs.factor b/core/io/encodings/encodings-docs.factor

index e5e71b05f0677b5bb1a4d15dafa1b2b512a521dc..548d2cd7fcc045de73161e2d3da00ec47c8ae21c 100644 (file)
--- a/core/io/encodings/encodings-docs.factor
+++ b/core/io/encodings/encodings-docs.factor
@@ -44,25 +44,21 @@ $nl { $vocab-link "io.encodings.utf16" } ;
  
  ARTICLE: "encodings-protocol" "Encoding protocol"
  "An encoding descriptor must implement the following methods. The methods are implemented on tuple classes by instantiating the class and calling the method again."
-{ $subsection decode-step }
-{ $subsection init-decoder }
-{ $subsection stream-write-encoded } ;
+{ $subsection decode-char }
+{ $subsection encode-char }
+"The following methods are optional:"
+{ $subsection <encoder> }
+{ $subsection <decoder> } ;
  
-HELP: decode-step ( buf char encoding -- )
-{ $values { "buf" "A string buffer which characters can be pushed to" }
-    { "char" "An octet which is read from a stream" }
+HELP: decode-char ( stream encoding -- char/f )
+{ $values { "stream" "an underlying input stream" }
      { "encoding" "An encoding descriptor tuple" } }
-{ $description "A single step in the decoding process must be defined for the decoding descriptor. When each octet is read, this word is called, and depending on the decoder's internal state, something may be pushed to the buffer or the state may change. This should not be used directly." } ;
-
-HELP: stream-write-encoded ( string stream encoding -- )
-{ $values { "string" "a string" }
-    { "stream" "an output stream" }
-    { "encoding" "an encoding descriptor" } }
-{ $description "Encodes the string with the given encoding descriptor, outputing the result to the given stream. This should not be used directly." } ;
+{ $description "Reads a single code point from the underlying stream, interpreting it by the encoding. This should not be used directly." } ;
  
-HELP: init-decoder ( stream encoding -- encoding )
-{ $values { "stream" "an input stream" }
+HELP: encode-char ( char stream encoding -- )
+{ $values { "char" "a character" }
+    { "stream" "an underlying output stream" }
      { "encoding" "an encoding descriptor" } }
-{ $description "Initializes the decoder tuple's state. The stream is exposed so that it can be read, eg for a BOM. This should not be used directly." } ;
+{ $description "Writes the code point in the encoding to the underlying stream given. This should not be used directly." } ;
  
-{ init-decoder decode-step stream-write-encoded } related-words
+{ encode-char decode-char } related-words
diff --git a/core/io/encodings/encodings.factor b/core/io/encodings/encodings.factor

index b7c71d5527644ee50267a30d4c20b85a2056a446..4cd43ef4553c48802222b1dbb65f2e054b8768d5 100755 (executable)
--- a/core/io/encodings/encodings.factor
+++ b/core/io/encodings/encodings.factor
@@ -61,25 +61,28 @@ M: tuple <decoder> f decoder construct-boa ;
      ] when nip ;
  
  : read-loop ( n stream -- string )
-    over 0 <string> [
+    SBUF" " clone [
          [
-            >r stream-read1 dup
-            [ swap r> set-nth-unsafe f ] [ r> 3drop t ] if
-        ] 2curry find-integer
-    ] keep swap [ head ] when* ;
+            >r nip stream-read1 dup
+            [ r> push f ] [ r> 2drop t ] if
+        ] 2curry find-integer drop
+    ] keep "" like f like ;
  
  M: decoder stream-read
      tuck read-loop fix-read ;
  
+M: decoder stream-read-partial stream-read ;
+
  : (read-until) ( buf quot -- string/f sep/f )
-    ! quot: -- char keep-going?
+    ! quot: -- char stop?
      dup call
      [ >r drop "" like r> ]
      [ pick push (read-until) ] if ; inline
  
  M: decoder stream-read-until
      SBUF" " clone -rot >decoder<
-    [ decode-char dup rot memq? ] 3curry (read-until) ;
+    [ decode-char [ dup rot memq? ] [ drop f t ] if* ] 3curry
+    (read-until) ;
  
  : fix-read1 ( stream char -- char )
      over decoder-cr [
@@ -118,6 +121,8 @@ M: encoder stream-write
  
  M: encoder dispose encoder-stream dispose ;
  
+M: encoder stream-flush encoder-stream stream-flush ;
+
  INSTANCE: encoder plain-writer
  
  ! Rebinding duplex streams which have not read anything yet
diff --git a/core/io/encodings/utf8/utf8.factor b/core/io/encodings/utf8/utf8.factor

index 02b10c45a5759d6d4198802d569cf0446223e507..e98860f25dffed7382c91ad7b291c605ba985dce 100644 (file)
--- a/core/io/encodings/utf8/utf8.factor
+++ b/core/io/encodings/utf8/utf8.factor
@@ -15,7 +15,7 @@ TUPLE: utf8 ;
  
  : append-nums ( stream byte -- stream char )
      over stream-read1 dup starts-2?
-    [ 6 shift swap BIN: 111111 bitand bitor ]
+    [ swap 6 shift swap BIN: 111111 bitand bitor ]
      [ 2drop replacement-char ] if ;
  
  : double ( stream byte -- stream char )
diff --git a/core/io/streams/byte-array/byte-array.factor b/core/io/streams/byte-array/byte-array.factor

index d5ca8eac6867127b849807a5189b34de06f84194..2a8441ff23d6894fc6df312862603969740ccf1f 100644 (file)
--- a/core/io/streams/byte-array/byte-array.factor
+++ b/core/io/streams/byte-array/byte-array.factor
@@ -1,5 +1,5 @@
  USING: byte-arrays byte-vectors kernel io.encodings io.streams.string
-sequences io namespaces ;
+sequences io namespaces io.encodings.private ;
  IN: io.streams.byte-array
  
  : <byte-writer> ( encoding -- stream )
@@ -7,7 +7,7 @@ IN: io.streams.byte-array
  
  : with-byte-writer ( encoding quot -- byte-array )
      >r <byte-writer> r> [ stdio get ] compose with-stream*
-    >byte-array ; inline
+    dup encoder? [ encoder-stream ] when >byte-array ; inline
  
  : <byte-reader> ( byte-array encoding -- stream )
      >r >byte-vector dup reverse-here r> <decoder> ;
diff --git a/core/io/streams/string/string.factor b/core/io/streams/string/string.factor

index 33404292a996f1ac45a1a7150c196e672a461eac..b7ff37a97190bb4d65ed1930edfd4e79c0fe1b13 100755 (executable)
--- a/core/io/streams/string/string.factor
+++ b/core/io/streams/string/string.factor
@@ -49,8 +49,11 @@ M: growable stream-read
  M: growable stream-read-partial
      stream-read ;
  
+TUPLE: null ;
+M: null decode-char drop stream-read1 ;
+
  : <string-reader> ( str -- stream )
-    >sbuf dup reverse-here f <decoder> ;
+    >sbuf dup reverse-here null <decoder> ;
  
  : with-string-reader ( str quot -- )
      >r <string-reader> r> with-stream ; inline
diff --git a/extra/io/encodings/ascii/ascii.factor b/extra/io/encodings/ascii/ascii.factor

index 16d87ef39c4cb9314813f651a871feebeb7dd659..d3fe51f28d1cdbcb790e08649fb684736c175433 100644 (file)
--- a/extra/io/encodings/ascii/ascii.factor
+++ b/extra/io/encodings/ascii/ascii.factor
@@ -1,14 +1,16 @@
  ! Copyright (C) 2008 Daniel Ehrenberg.
  ! See http://factorcode.org/license.txt for BSD license.
-USING: io io.encodings kernel math ;
+USING: io io.encodings kernel math io.encodings.private ;
  IN: io.encodings.ascii
  
  <PRIVATE
  : encode-if< ( char stream encoding max -- )
-    nip pick > [ encode-error ] [ stream-write1 ] if ;
+    nip 1- pick < [ encode-error ] [ stream-write1 ] if ;
  
  : decode-if< ( stream encoding max -- character )
-    nip swap stream-read1 tuck > [ drop replacement-character ] unless ;
+    nip swap stream-read1
+    [ tuck > [ drop replacement-char ] unless ]
+    [ drop f ] if* ;
  PRIVATE>
  
  TUPLE: ascii ;
diff --git a/extra/io/encodings/utf16/utf16.factor b/extra/io/encodings/utf16/utf16.factor

index 7e82935db7904c561f74e2b9c48c53d5219275e1..290761ec91e7985752708bf841d52f34db0cd82e 100755 (executable)
--- a/extra/io/encodings/utf16/utf16.factor
+++ b/extra/io/encodings/utf16/utf16.factor
@@ -1,14 +1,14 @@
  ! Copyright (C) 2006, 2008 Daniel Ehrenberg.
  ! See http://factorcode.org/license.txt for BSD license.
  USING: math kernel sequences sbufs vectors namespaces io.binary
-io.encodings combinators splitting io byte-arrays ;
+io.encodings combinators splitting io byte-arrays inspector ;
  IN: io.encodings.utf16
  
  TUPLE: utf16be ;
  
-TUPLE: utf16le ch state ;
+TUPLE: utf16le ;
  
-TUPLE: utf16 started? ;
+TUPLE: utf16 ;
  
  <PRIVATE
  
@@ -21,12 +21,12 @@ TUPLE: utf16 started? ;
      over stream-read1 swap append-nums ;
  
  : quad-be ( stream byte -- stream char )
-    double-be over stream-read1 dup [
+    double-be over stream-read1 [
          dup -2 shift BIN: 110111 number= [
              >r 2 shift r> BIN: 11 bitand bitor
              over stream-read1 swap append-nums HEX: 10000 +
-        ] [ 2drop replacement-char ] if
-    ] when ;
+        ] [ 2drop dup stream-read1 drop replacement-char ] if
+    ] when* ;
  
  : ignore ( stream -- stream char )
      dup stream-read1 drop replacement-char ;
@@ -38,7 +38,7 @@ TUPLE: utf16 started? ;
          [ drop ignore ] if
      ] [ double-be ] if ;
      
-M: decode-char
+M: utf16be decode-char
      drop dup stream-read1 dup [ begin-utf16be ] when nip ;
  
  ! UTF-16LE decoding
@@ -54,59 +54,48 @@ M: decode-char
          dup BIN: 100 bitand 0 number=
          [ BIN: 11 bitand 8 shift bitor quad-le ]
          [ 2drop replacement-char ] if
-    ] [ swap append-nums ] if ;
-
-: decode-utf16le-step ( buf byte ch state -- buf ch state )
-    {
-        { begin [ drop double ] }
-        { double [ handle-double ] }
-        { quad2 [ 10 shift bitor quad3 ] }
-        { quad3 [ handle-quad3le ] }
-    } case ;
+    ] [ append-nums ] if ;
  
  : begin-utf16le ( stream byte -- stream char )
-    over stream-read1 [ double-le ] [ drop replacement-char ] if*
+    over stream-read1 [ double-le ] [ drop replacement-char ] if* ;
  
-M: decode-char
+M: utf16le decode-char
      drop dup stream-read1 dup [ begin-utf16le ] when nip ;
  
  ! UTF-16LE/BE encoding
  
-: encode-first
+: encode-first ( char -- byte1 byte2 )
      -10 shift
      dup -8 shift BIN: 11011000 bitor
      swap HEX: FF bitand ;
  
-: encode-second
+: encode-second ( char -- byte3 byte4 )
      BIN: 1111111111 bitand
      dup -8 shift BIN: 11011100 bitor
      swap BIN: 11111111 bitand ;
  
  : stream-write2 ( stream char1 char2 -- )
-    rot [ stream-write1 ] 2apply ;
+    rot [ stream-write1 ] curry 2apply ;
  
  : char>utf16be ( stream char -- )
      dup HEX: FFFF > [
          HEX: 10000 -
-        dup encode-first stream-write2
+        2dup encode-first stream-write2
          encode-second stream-write2
      ] [ h>b/b swap stream-write2 ] if ;
  
  M: utf16be encode-char ( char stream encoding -- )
-    drop char>utf16be ;
+    drop swap char>utf16be ;
  
-: char>utf16le ( char -- )
+: char>utf16le ( char stream -- )
      dup HEX: FFFF > [
          HEX: 10000 -
-        dup encode-first swap stream-write2
+        2dup encode-first swap stream-write2
          encode-second swap stream-write2
      ] [ h>b/b stream-write2 ] if ; 
  
-: stream-write-utf16le ( string stream -- )
-    [ [ char>utf16le ] each ] with-stream* ;
-
-M: utf16le stream-write-encoded ( string stream encoding -- )
-    drop stream-write-utf16le ;
+M: utf16le encode-char ( char stream encoding -- )
+    drop swap char>utf16le ;
  
  ! UTF-16
  
@@ -118,13 +107,16 @@ M: utf16le stream-write-encoded ( string stream encoding -- )
  
  : start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
  
+TUPLE: missing-bom ;
+M: missing-bom summary drop "The BOM for a UTF-16 stream was missing" ;
+
  : bom>le/be ( bom -- le/be )
      dup bom-le sequence= [ drop utf16le ] [
-        bom-be sequence= [ utf16be ] [ decode-error ] if
+        bom-be sequence= [ utf16be ] [ missing-bom ] if
      ] if ;
  
  M: utf16 <decoder> ( stream utf16 -- decoder )
-    2 rot stream-read bom>le/be <decoder> ;
+    drop 2 over stream-read bom>le/be <decoder> ;
  
  M: utf16 <encoder> ( stream utf16 -- encoder )
      drop bom-le over stream-write utf16le <encoder> ;
diff --git a/extra/io/unix/launcher/launcher-tests.factor b/extra/io/unix/launcher/launcher-tests.factor

index aa54d3ec9435594d3fa9d53343bf72964d48b8be..5370817d2f0c7b5979a579cf1f89891c77e6ccab 100644 (file)
--- a/extra/io/unix/launcher/launcher-tests.factor
+++ b/extra/io/unix/launcher/launcher-tests.factor
@@ -1,6 +1,6 @@
  IN: io.unix.launcher.tests
  USING: io.files tools.test io.launcher arrays io namespaces
-continuations math io.encodings.ascii io.encodings.latin1
+continuations math io.encodings.binary io.encodings.ascii
  accessors kernel sequences ;
  
  [ ] [
@@ -64,7 +64,7 @@ accessors kernel sequences ;
  
  [ ] [
      2 [
-        "launcher-test-1" temp-file ascii <file-appender> [
+        "launcher-test-1" temp-file binary <file-appender> [
              <process>
                  swap >>stdout
                  "echo Hello" >>command
@@ -84,7 +84,7 @@ accessors kernel sequences ;
      <process>
          "env" >>command
          { { "A" "B" } } >>environment
-    latin1 <process-stream> lines
+    ascii <process-stream> lines
      "A=B" swap member?
  ] unit-test
  
@@ -93,5 +93,5 @@ accessors kernel sequences ;
          "env" >>command
          { { "A" "B" } } >>environment
          +replace-environment+ >>environment-mode
-    latin1 <process-stream> lines
+    ascii <process-stream> lines
  ] unit-test
author	Daniel Ehrenberg <ehrenbed@carleton.edu>
	Tue, 18 Mar 2008 21:01:14 +0000 (17:01 -0400)
committer	Daniel Ehrenberg <ehrenbed@carleton.edu>
	Tue, 18 Mar 2008 21:01:14 +0000 (17:01 -0400)
core/io/encodings/encodings-docs.factor		patch \| blob \| history
core/io/encodings/encodings.factor		patch \| blob \| history
core/io/encodings/utf8/utf8.factor		patch \| blob \| history
core/io/streams/byte-array/byte-array.factor		patch \| blob \| history
core/io/streams/string/string.factor		patch \| blob \| history
extra/io/encodings/ascii/ascii.factor		patch \| blob \| history
extra/io/encodings/utf16/utf16.factor		patch \| blob \| history
extra/io/unix/launcher/launcher-tests.factor		patch \| blob \| history