]> gitweb.factorcode.org Git - factor.git/commitdiff
New extra/tr/ vocab for fast translation of ASCII strings; improves reverse-complemen...
authorSlava Pestov <slava@slava-pestovs-macbook-pro.local>
Thu, 10 Jul 2008 00:25:24 +0000 (19:25 -0500)
committerSlava Pestov <slava@slava-pestovs-macbook-pro.local>
Thu, 10 Jul 2008 00:25:24 +0000 (19:25 -0500)
extra/benchmark/reverse-complement/reverse-complement.factor
extra/soundex/author.txt [new file with mode: 0644]
extra/soundex/soundex-tests.factor [new file with mode: 0644]
extra/soundex/soundex.factor [new file with mode: 0644]
extra/soundex/summary.txt [new file with mode: 0644]
extra/tr/authors.txt [new file with mode: 0644]
extra/tr/summary.txt [new file with mode: 0644]
extra/tr/tr-tests.factor [new file with mode: 0644]
extra/tr/tr.factor [new file with mode: 0644]

index b7c1db043cc89e82035a3b38469ec984de3fc75d..665cbba30d60d9b5f234f7cb25f18aab0fc5ffd5 100755 (executable)
@@ -1,30 +1,20 @@
 USING: io io.files io.streams.duplex kernel sequences
 sequences.private strings vectors words memoize splitting
-grouping hints unicode.case continuations io.encodings.ascii ;
+grouping hints tr continuations io.encodings.ascii
+unicode.case ;
 IN: benchmark.reverse-complement
 
-MEMO: trans-map ( -- str )
-    256 >string
-    "TGCAAKYRMBDHV" "ACGTUMRYKVHDB"
-    [ pick set-nth ] 2each ;
-
-: do-trans-map ( str -- )
-    [ ch>upper trans-map nth ] change-each ;
-
-HINTS: do-trans-map string ;
+TR: trans-map ch>upper "ACGTUMRYKVHDB" "TGCAAKYRMBDHV" ;
 
 : translate-seq ( seq -- str )
-    concat dup reverse-here dup do-trans-map ;
+    concat dup reverse-here dup trans-map-fast ;
 
 : show-seq ( seq -- )
     translate-seq 60 <groups> [ print ] each ;
 
 : do-line ( seq line -- seq )
-    dup first ">;" memq? [
-        over show-seq print dup delete-all
-    ] [
-        over push
-    ] if ;
+    dup first ">;" memq?
+    [ over show-seq print dup delete-all ] [ over push ] if ;
 
 HINTS: do-line vector string ;
 
diff --git a/extra/soundex/author.txt b/extra/soundex/author.txt
new file mode 100644 (file)
index 0000000..1901f27
--- /dev/null
@@ -0,0 +1 @@
+Slava Pestov
diff --git a/extra/soundex/soundex-tests.factor b/extra/soundex/soundex-tests.factor
new file mode 100644 (file)
index 0000000..df6338c
--- /dev/null
@@ -0,0 +1,4 @@
+IN: soundex.tests
+USING: soundex tools.test ;
+
+[ "S162" ] [ "supercalifrag" soundex ] unit-test
diff --git a/extra/soundex/soundex.factor b/extra/soundex/soundex.factor
new file mode 100644 (file)
index 0000000..c82825d
--- /dev/null
@@ -0,0 +1,33 @@
+! Copyright (C) 2008 Slava Pestov.
+! See http://factorcode.org/license.txt for BSD license.
+USING: sequences sequences.lib grouping assocs kernel ascii
+unicode.case tr ;
+IN: soundex
+
+TR: soundex-tr
+    ch>upper
+    "AEHIOUWYBFPVCGJKQSXZDTLMNR"
+    "00000000111122222222334556" ;
+
+: remove-duplicates ( seq -- seq' )
+    #! Remove _consecutive_ duplicates (unlike prune which removes
+    #! all duplicates).
+    [ 2 <clumps> [ = not ] assoc-filter values ] [ first ] bi prefix ;
+
+: first>upper ( seq -- seq' ) 1 head >upper ;
+: trim-first ( seq -- seq' ) dup first [ = ] curry left-trim ;
+: remove-zeroes ( seq -- seq' ) CHAR: 0 swap remove ;
+: remove-non-alpha ( seq -- seq' ) [ alpha? ] filter ;
+: pad-4 ( first seq -- seq' ) "000" 3append 4 head ;
+
+: soundex ( string -- soundex )
+    remove-non-alpha [ f ] [
+        [ first>upper ]
+        [
+            soundex-tr
+            trim-first
+            remove-duplicates
+            remove-zeroes
+        ] bi
+        pad-4
+    ] if-empty ;
diff --git a/extra/soundex/summary.txt b/extra/soundex/summary.txt
new file mode 100644 (file)
index 0000000..95a271d
--- /dev/null
@@ -0,0 +1 @@
+Soundex is a phonetic algorithm for indexing names by sound
diff --git a/extra/tr/authors.txt b/extra/tr/authors.txt
new file mode 100644 (file)
index 0000000..1901f27
--- /dev/null
@@ -0,0 +1 @@
+Slava Pestov
diff --git a/extra/tr/summary.txt b/extra/tr/summary.txt
new file mode 100644 (file)
index 0000000..8678446
--- /dev/null
@@ -0,0 +1 @@
+Fast character-to-character translation of ASCII strings
diff --git a/extra/tr/tr-tests.factor b/extra/tr/tr-tests.factor
new file mode 100644 (file)
index 0000000..1eea69b
--- /dev/null
@@ -0,0 +1,7 @@
+IN: tr.tests
+USING: tr tools.test unicode.case ;
+
+TR: tr-test ch>upper "ABC" "XYZ" ;
+
+[ "XXYY" ] [ "aabb" tr-test ] unit-test
+[ "XXYY" ] [ "AABB" tr-test ] unit-test
diff --git a/extra/tr/tr.factor b/extra/tr/tr.factor
new file mode 100644 (file)
index 0000000..a95d308
--- /dev/null
@@ -0,0 +1,35 @@
+! Copyright (C) 2008 Slava Pestov.
+! See http://factorcode.org/license.txt for BSD license.
+USING: byte-arrays strings sequences sequences.private
+fry kernel words parser lexer assocs ;
+IN: tr
+
+<PRIVATE
+
+: compute-tr ( quot from to -- mapping )
+    zip [ 256 ] 2dip '[ [ @ , at ] keep or ] B{ } map-as ; inline
+
+: tr-hints ( word -- )
+    { { byte-array } { string } } "specializer" set-word-prop ;
+
+: create-tr ( token -- word )
+    create-in dup tr-hints ;
+
+: define-tr ( word mapping -- )
+    '[ [ , nth ] map ]
+    (( seq -- translated ))
+    define-declared ;
+
+: define-fast-tr ( word mapping -- )
+    '[ [ , nth-unsafe ] change-each ]
+    (( seq -- ))
+    define-declared ;
+
+PRIVATE>
+
+: TR:
+    scan parse-definition
+    unclip-last [ unclip-last ] dip compute-tr
+    [ [ create-tr ] dip define-tr ]
+    [ [ "-fast" append create-tr ] dip define-fast-tr ] 2bi ;
+    parsing