Adding word breaks to regexp

author Daniel Ehrenberg <littledan@Macintosh-122.local>

Wed, 11 Mar 2009 20:51:54 +0000 (15:51 -0500)

committer Daniel Ehrenberg <littledan@Macintosh-122.local>

Wed, 11 Mar 2009 20:51:54 +0000 (15:51 -0500)
author Daniel Ehrenberg <littledan@Macintosh-122.local>
Wed, 11 Mar 2009 20:51:54 +0000 (15:51 -0500)
committer Daniel Ehrenberg <littledan@Macintosh-122.local>
Wed, 11 Mar 2009 20:51:54 +0000 (15:51 -0500)
diff --git a/basis/regexp/ast/ast.factor b/basis/regexp/ast/ast.factor

index 92887668881b330e0343636f795cdc72e28ec61c..ffaed2db62367001df0bec3c848bc9b05133ef84 100644 (file)
--- a/basis/regexp/ast/ast.factor
+++ b/basis/regexp/ast/ast.factor
@@ -58,8 +58,8 @@ M: from-to <times>
  : char-class ( ranges ? -- term )
      [ <or-class> ] dip [ <not-class> ] when ;
  
-TUPLE: lookahead term positive? ;
+TUPLE: lookahead term ;
  C: <lookahead> lookahead
  
-TUPLE: lookbehind term positive? ;
+TUPLE: lookbehind term ;
  C: <lookbehind> lookbehind
diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor

index 4ddd47018998a52f9ea679bd251e8c65afbb2a23..1959a91cb56138ee8ea9d89e971d70c14a761aea 100644 (file)
--- a/basis/regexp/classes/classes.factor
+++ b/basis/regexp/classes/classes.factor
@@ -12,7 +12,7 @@ ascii-class punctuation-class java-printable-class blank-class
  control-character-class hex-digit-class java-blank-class c-identifier-class
  unmatchable-class terminator-class word-boundary-class ;
  
-SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file ;
+SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file word-break ;
  
  TUPLE: range from to ;
  C: <range> range
diff --git a/basis/regexp/compiler/compiler.factor b/basis/regexp/compiler/compiler.factor

index 0e0c0eaae6b155fda68ccd76ee2c5a661663bc8a..c837df0f0f2e887586ea77d396c2cf84d26b95a2 100644 (file)
--- a/basis/regexp/compiler/compiler.factor
+++ b/basis/regexp/compiler/compiler.factor
@@ -3,7 +3,7 @@
  USING: regexp.classes kernel sequences regexp.negation
  quotations assocs fry math locals combinators
  accessors words compiler.units kernel.private strings
-sequences.private arrays call namespaces
+sequences.private arrays call namespaces unicode.breaks
  regexp.transition-tables combinators.short-circuit ;
  IN: regexp.compiler
  
@@ -15,6 +15,10 @@ SYMBOL: backwards?
  <PRIVATE
  
  M: t question>quot drop [ 2drop t ] ;
+M: f question>quot drop [ 2drop f ] ;
+
+M: not-class question>quot
+    class>> question>quot [ not ] compose ;
  
  M: beginning-of-input question>quot
      drop [ drop zero? ] ;
@@ -36,6 +40,9 @@ M: $ question>quot
  M: ^ question>quot
      drop [ { [ drop zero? ] [ [ 1- ] dip ?nth "\r\n" member? ] } 2|| ] ;
  
+M: word-break question>quot
+    drop [ word-break-at? ] ;
+
  : (execution-quot) ( next-state -- quot )
      ! The conditions here are for lookaround and anchors, etc
      dup condition? [
diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor

index adbf0c53d33f475c6537cc95bbdab89c8ce4379d..c6a69f250875a2ddf999844f19c10a0f79dda013 100644 (file)
--- a/basis/regexp/parser/parser.factor
+++ b/basis/regexp/parser/parser.factor
@@ -56,6 +56,8 @@ ERROR: bad-class name ;
          { CHAR: z [ end-of-input <tagged-epsilon> ] }
          { CHAR: Z [ end-of-file <tagged-epsilon> ] }
          { CHAR: A [ beginning-of-input <tagged-epsilon> ] }
+        { CHAR: b [ word-break <tagged-epsilon> ] }
+        { CHAR: B [ word-break <not-class> <tagged-epsilon> ] }
          [ ]
      } case ;
  
@@ -138,10 +140,10 @@ Parenthized = "?:" Alternation:a => [[ a ]]
                  => [[ a on off parse-options <with-options> ]]
              | "?#" [^)]* => [[ f ]]
              | "?~" Alternation:a => [[ a <negation> ]]
-            | "?=" Alternation:a => [[ a t <lookahead> <tagged-epsilon> ]]
-            | "?!" Alternation:a => [[ a f <lookahead> <tagged-epsilon> ]]
-            | "?<=" Alternation:a => [[ a t <lookbehind> <tagged-epsilon> ]]
-            | "?<!" Alternation:a => [[ a f <lookbehind> <tagged-epsilon> ]]
+            | "?=" Alternation:a => [[ a <lookahead> <tagged-epsilon> ]]
+            | "?!" Alternation:a => [[ a <lookahead> <not-class> <tagged-epsilon> ]]
+            | "?<=" Alternation:a => [[ a <lookbehind> <tagged-epsilon> ]]
+            | "?<!" Alternation:a => [[ a <lookbehind> <not-class> <tagged-epsilon> ]]
              | Alternation
  
  Element = "(" Parenthized:p ")" => [[ p ]]
diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor

index e01241552dbbd957b9900ded1e2559861c1ebfc3..0b94f8296d9c0bbfc668f15d74f9ff0e694acf38 100644 (file)
--- a/basis/regexp/regexp-tests.factor
+++ b/basis/regexp/regexp-tests.factor
@@ -433,24 +433,24 @@ IN: regexp-tests
  
  [ { "foo" "fxx" "fab" } ] [ "fab fxx foo" R/ f../r all-matches [ >string ] map ] unit-test
  
-! [ t ] [ "foo" "\\bfoo\\b" <regexp> matches? ] unit-test
-! [ t ] [ "afoob" "\\Bfoo\\B" <regexp> matches? ] unit-test
-! [ t ] [ "afoob" "\\bfoo\\b" <regexp> matches? ] unit-test
-! [ f ] [ "foo" "\\Bfoo\\B" <regexp> matches? ] unit-test
-
-! [ 3 ] [ "foo bar" "foo\\b" <regexp> match-index-head ] unit-test
-! [ f ] [ "fooxbar" "foo\\b" <regexp> matches? ] unit-test
-! [ t ] [ "foo" "foo\\b" <regexp> matches? ] unit-test
-! [ t ] [ "foo bar" "foo\\b bar" <regexp> matches? ] unit-test
-! [ f ] [ "fooxbar" "foo\\bxbar" <regexp> matches? ] unit-test
-! [ f ] [ "foo" "foo\\bbar" <regexp> matches? ] unit-test
-
-! [ f ] [ "foo bar" "foo\\B" <regexp> matches? ] unit-test
-! [ 3 ] [ "fooxbar" "foo\\B" <regexp> match-index-head ] unit-test
-! [ t ] [ "foo" "foo\\B" <regexp> matches? ] unit-test
-! [ f ] [ "foo bar" "foo\\B bar" <regexp> matches? ] unit-test
-! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test
-! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test
+[ t ] [ "foo" "\\bfoo\\b" <regexp> re-contains? ] unit-test
+[ t ] [ "afoob" "\\Bfoo\\B" <regexp> re-contains? ] unit-test
+[ f ] [ "afoob" "\\bfoo\\b" <regexp> re-contains? ] unit-test
+[ f ] [ "foo" "\\Bfoo\\B" <regexp> re-contains? ] unit-test
+
+[ 3 ] [ "foo bar" "foo\\b" <regexp> first-match length ] unit-test
+[ f ] [ "fooxbar" "foo\\b" <regexp> re-contains? ] unit-test
+[ t ] [ "foo" "foo\\b" <regexp> re-contains? ] unit-test
+[ t ] [ "foo bar" "foo\\b bar" <regexp> matches? ] unit-test
+[ f ] [ "fooxbar" "foo\\bxbar" <regexp> matches? ] unit-test
+[ f ] [ "foo" "foo\\bbar" <regexp> matches? ] unit-test
+
+[ f ] [ "foo bar" "foo\\B" <regexp> re-contains? ] unit-test
+[ 3 ] [ "fooxbar" "foo\\B" <regexp> first-match length ] unit-test
+[ f ] [ "foo" "foo\\B" <regexp> re-contains? ] unit-test
+[ f ] [ "foo bar" "foo\\B bar" <regexp> matches? ] unit-test
+[ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test
+[ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test
  
  ! [ 1 ] [ "aaacb" "a+?" <regexp> match-index-head ] unit-test
  ! [ 1 ] [ "aaacb" "aa??" <regexp> match-index-head ] unit-test
diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor

index 7f27a13104a011f67505e454273f5dcbbdc9536a..a7f2fa4e12839d679e33b4e7c17fe4c3ec055f48 100644 (file)
--- a/basis/regexp/regexp.factor
+++ b/basis/regexp/regexp.factor
@@ -17,21 +17,16 @@ TUPLE: reverse-regexp < regexp ;
  
  <PRIVATE
  
-: maybe-negated ( lookaround quot -- regexp-quot )
-    '[ term>> @ ] [ positive?>> [ ] [ not ] ? ] bi compose ; inline
-
  M: lookahead question>quot ! Returns ( index string -- ? )
-    [ ast>dfa dfa>shortest-word '[ f _ execute ] ] maybe-negated ;
+    term>> ast>dfa dfa>shortest-word '[ f _ execute ] ;
  
  : <reversed-option> ( ast -- reversed )
      "r" string>options <with-options> ;
  
  M: lookbehind question>quot ! Returns ( index string -- ? )
-    [
-        <reversed-option>
-        ast>dfa dfa>reverse-shortest-word
-        '[ [ 1- ] dip f _ execute ]
-    ] maybe-negated ;
+    term>> <reversed-option>
+    ast>dfa dfa>reverse-shortest-word
+    '[ [ 1- ] dip f _ execute ] ;
  
  : check-string ( string -- string )
      ! Make this configurable
diff --git a/basis/unicode/breaks/breaks-tests.factor b/basis/unicode/breaks/breaks-tests.factor

index d8e220cf1816903c568ad057922fa934c8b59b06..493c2db0c2c7fa2efcfde51dcb3d9b1652bcd18d 100644 (file)
--- a/basis/unicode/breaks/breaks-tests.factor
+++ b/basis/unicode/breaks/breaks-tests.factor
@@ -37,3 +37,5 @@ IN: unicode.breaks.tests
  
  grapheme-break-test parse-test-file [ >graphemes ] test
  word-break-test parse-test-file [ >words ] test
+
+[ { t f t t f t } ] [ 6 [ "as df" word-break-at? ] map ] unit-test
diff --git a/basis/unicode/breaks/breaks.factor b/basis/unicode/breaks/breaks.factor

index ddcb99b829dba82cbd772e004f786348d06f1c8b..f2e94545455972ba712c954d1d714b01db6d6ff3 100644 (file)
--- a/basis/unicode/breaks/breaks.factor
+++ b/basis/unicode/breaks/breaks.factor
@@ -228,3 +228,20 @@ PRIVATE>
  
  : >words ( str -- words )
      [ first-word ] >pieces ;
+
+<PRIVATE
+
+: nth-next ( i str -- str[i-1] str[i] )
+    [ [ 1- ] keep ] dip '[ _ nth ] bi@ ;
+
+PRIVATE>
+
+: word-break-at? ( i str -- ? )
+    {
+        [ drop zero? ]
+        [ length = ]
+        [
+            [ nth-next [ word-break-prop ] dip ] 2keep
+            word-break-next nip
+        ]
+    } 2|| ;
author	Daniel Ehrenberg <littledan@Macintosh-122.local>
	Wed, 11 Mar 2009 20:51:54 +0000 (15:51 -0500)
committer	Daniel Ehrenberg <littledan@Macintosh-122.local>
	Wed, 11 Mar 2009 20:51:54 +0000 (15:51 -0500)
basis/regexp/ast/ast.factor		patch | blob | history
basis/regexp/classes/classes.factor		patch | blob | history
basis/regexp/compiler/compiler.factor		patch | blob | history
basis/regexp/parser/parser.factor		patch | blob | history
basis/regexp/regexp-tests.factor		patch | blob | history
basis/regexp/regexp.factor		patch | blob | history
basis/unicode/breaks/breaks-tests.factor		patch | blob | history
basis/unicode/breaks/breaks.factor		patch | blob | history