Regexp supports Unicode properties (categories and script)

author Daniel Ehrenberg <littledan@Macintosh-122.local>

Sat, 21 Mar 2009 00:03:26 +0000 (19:03 -0500)

committer Daniel Ehrenberg <littledan@Macintosh-122.local>

Sat, 21 Mar 2009 00:03:26 +0000 (19:03 -0500)
author Daniel Ehrenberg <littledan@Macintosh-122.local>
Sat, 21 Mar 2009 00:03:26 +0000 (19:03 -0500)
committer Daniel Ehrenberg <littledan@Macintosh-122.local>
Sat, 21 Mar 2009 00:03:26 +0000 (19:03 -0500)
diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor

index 28b0ed1563441aa7a410fa04fc50e434c6f685cd..e114dea26013f5be5123d060645bd141f6b2ecf4 100644 (file)
--- a/basis/regexp/classes/classes.factor
+++ b/basis/regexp/classes/classes.factor
@@ -108,21 +108,24 @@ M: terminator-class class-member? ( obj class -- ? )
  
  M: f class-member? 2drop f ;
  
+: same? ( obj1 obj2 quot1: ( obj1 -- val1 ) quot2: ( obj2 -- val2 ) -- ? )
+    bi* = ; inline
+
  M: script-class class-member?
-    [ script-of ] [ script>> ] bi* = ;
+    [ script-of ] [ script>> ] same? ;
  
  M: category-class class-member?
-    [ category# ] [ category>> ] bi* = ;
+    [ category ] [ category>> ] same? ;
  
  M: category-range-class class-member?
-    [ category first ] [ category>> ] bi* = ;
+    [ category first ] [ category>> ] same? ;
  
  TUPLE: not-class class ;
  
  PREDICATE: not-integer < not-class class>> integer? ;
  
  UNION: simple-class
-    primitive-class range-class category-class category-range-class dot ;
+    primitive-class range-class dot ;
  PREDICATE: not-simple < not-class class>> simple-class? ;
  
  M: not-class class-member?
diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor

index bf5465e0e2607f0e8142360dfa9f84dde122c81a..e8de469a9475cb7dcce7026b9f4768378a0e101a 100644 (file)
--- a/basis/regexp/parser/parser.factor
+++ b/basis/regexp/parser/parser.factor
@@ -2,7 +2,8 @@
  ! See http://factorcode.org/license.txt for BSD license.
  USING: peg.ebnf kernel math.parser sequences assocs arrays fry math
  combinators regexp.classes strings splitting peg locals accessors
-regexp.ast unicode.case ;
+regexp.ast unicode.case unicode.script.private unicode.categories
+memoize interval-maps sets unicode.data combinators.short-circuit ;
  IN: regexp.parser
  
  : allowed-char? ( ch -- ? )
@@ -18,15 +19,41 @@ ERROR: bad-number ;
  
  ERROR: bad-class name ;
  
+: simple ( str -- simple )
+    ! Alternatively, first collation key level?
+    >case-fold [ " \t_" member? not ] filter ;
+
+: simple-table ( seq -- table )
+    [ [ simple ] keep ] H{ } map>assoc ;
+
+MEMO: simple-script-table ( -- table )
+    script-table interval-values prune simple-table ;
+
+MEMO: simple-category-table ( -- table )
+    categories simple-table ;
+
  : parse-unicode-class ( name -- class )
-    ! Implement this!
-    drop f ;
+    {
+        { [ dup { [ length 1 = ] [ first "clmnpsz" member? ] } 1&& ] [
+            >upper first
+            <category-range-class>
+        ] }
+        { [ dup >title categories member? ] [
+            simple-category-table at <category-class>
+        ] }
+        { [ "script=" ?head ] [
+            dup simple-script-table at
+            [ <script-class> ]
+            [ "script=" prepend bad-class ] ?if
+        ] }
+        [ bad-class ]
+    } cond ;
  
  : unicode-class ( name -- class )
      dup parse-unicode-class [ ] [ bad-class ] ?if ;
  
  : name>class ( name -- class )
-    >string >case-fold {
+    >string simple {
          { "lower" letter-class }
          { "upper" LETTER-class }
          { "alpha" Letter-class }
diff --git a/basis/regexp/regexp-docs.factor b/basis/regexp/regexp-docs.factor

index 6ad340a82ddbff38031e4de1f2f5b69385d4f887..6d9f03781d53490572ed9720557c7d923265aa6f 100644 (file)
--- a/basis/regexp/regexp-docs.factor
+++ b/basis/regexp/regexp-docs.factor
@@ -72,8 +72,10 @@ ARTICLE: { "regexp" "syntax" } "Regular expression syntax"
      { { $snippet "\\p{blank}" } "Non-newline whitespace" }
      { { $snippet "\\p{cntrl}" } "Control character" }
      { { $snippet "\\p{space}" } "Whitespace" }
-    { { $snippet "\\p{xdigit}" } "Hexidecimal digit" } } ! In the future: Unicode
-"Full unicode properties are not yet supported."
+    { { $snippet "\\p{xdigit}" } "Hexidecimal digit" }
+    { { $snippet "\\p{Nd}" } "Character in Unicode category Nd" } 
+    { { $snippet "\\p{Z}" } "Character in Unicode category beginning with Z" } 
+    { { $snippet "\\p{script=Cham}" } "Character in the Cham writing system" } }
  { $heading "Boundaries" }
  "Special operators exist to match certain points in the string. These are called 'zero-width' because they do not consume any characters."
  { $table
diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor

index 0836c0988b1a434efb880f7da3061ba2d6fb42ca..999caeaed63c1a5935ada2db98625b35754f8362 100644 (file)
--- a/basis/regexp/regexp-tests.factor
+++ b/basis/regexp/regexp-tests.factor
@@ -480,3 +480,31 @@ IN: regexp-tests
  [ f ] [ "a\r" R/ a$./mds matches? ] unit-test
  [ t ] [ "a\n" R/ a$./ms matches? ] unit-test
  [ t ] [ "a\n" R/ a$./mds matches? ] unit-test
+
+! Unicode categories
+[ t ] [ "a" R/ \p{L}/ matches? ] unit-test
+[ t ] [ "A" R/ \p{L}/ matches? ] unit-test
+[ f ] [ " " R/ \p{L}/ matches? ] unit-test
+[ f ] [ "a" R/ \P{L}/ matches? ] unit-test
+[ f ] [ "A" R/ \P{L}/ matches? ] unit-test
+[ t ] [ " " R/ \P{L}/ matches? ] unit-test
+
+[ t ] [ "a" R/ \p{Ll}/ matches? ] unit-test
+[ f ] [ "A" R/ \p{Ll}/ matches? ] unit-test
+[ f ] [ " " R/ \p{Ll}/ matches? ] unit-test
+[ f ] [ "a" R/ \P{Ll}/ matches? ] unit-test
+[ t ] [ "A" R/ \P{Ll}/ matches? ] unit-test
+[ t ] [ " " R/ \P{Ll}/ matches? ] unit-test
+
+[ t ] [ "a" R/ \p{script=Latin}/ matches? ] unit-test
+[ f ] [ " " R/ \p{script=Latin}/ matches? ] unit-test
+[ f ] [ "a" R/ \P{script=Latin}/ matches? ] unit-test
+[ t ] [ " " R/ \P{script=Latin}/ matches? ] unit-test
+
+! These should be case-insensitive
+[ f ] [ " " R/ \p{l}/ matches? ] unit-test
+[ f ] [ "a" R/ \P{l}/ matches? ] unit-test
+[ f ] [ "a" R/ \P{ll}/ matches? ] unit-test
+[ t ] [ " " R/ \P{LL}/ matches? ] unit-test
+[ f ] [ "a" R/ \P{sCriPt = latin}/ matches? ] unit-test
+[ t ] [ " " R/ \P{SCRIPT = laTIn}/ matches? ] unit-test
author	Daniel Ehrenberg <littledan@Macintosh-122.local>
	Sat, 21 Mar 2009 00:03:26 +0000 (19:03 -0500)
committer	Daniel Ehrenberg <littledan@Macintosh-122.local>
	Sat, 21 Mar 2009 00:03:26 +0000 (19:03 -0500)
basis/regexp/classes/classes.factor		patch \| blob \| history
basis/regexp/parser/parser.factor		patch \| blob \| history
basis/regexp/regexp-docs.factor		patch \| blob \| history
basis/regexp/regexp-tests.factor		patch \| blob \| history