Merge branch 'master' of git://factorcode.org/git/factor

author Doug Coleman <doug.coleman@gmail.com>

Wed, 27 Aug 2008 15:22:28 +0000 (10:22 -0500)

committer Doug Coleman <doug.coleman@gmail.com>

Wed, 27 Aug 2008 15:22:28 +0000 (10:22 -0500)
author Doug Coleman <doug.coleman@gmail.com>
Wed, 27 Aug 2008 15:22:28 +0000 (10:22 -0500)
committer Doug Coleman <doug.coleman@gmail.com>
Wed, 27 Aug 2008 15:22:28 +0000 (10:22 -0500)
diff --cc unfinished/regexp2/parser/parser.factor

index 0000000000000000000000000000000000000000,6eda3310d03ab98484488011088cc1bdf09edd59..39ca01e319a1368df50eabfef0f9fce7a585970c

mode 000000,100644..100644
--- /dev/null
--- 2/unfinished/regexp2/parser/parser.factor
+++ b/unfinished/regexp2/parser/parser.factor
@@@ -1,0 -1,391 +1,416 @@@
- -TUPLE: question term ; INSTANCE: question node
+ ! Copyright (C) 2008 Doug Coleman.
+ ! See http://factorcode.org/license.txt for BSD license.
+ USING: accessors arrays assocs combinators io io.streams.string
+ kernel math math.parser multi-methods namespaces qualified sets
+ quotations sequences sequences.lib splitting symbols vectors
+ dlists math.order combinators.lib unicode.categories strings
+ sequences.lib regexp2.backend regexp2.utils unicode.case ;
+ IN: regexp2.parser
+ 
+ FROM: math.ranges => [a,b] ;
+ 
+ MIXIN: node
+ TUPLE: concatenation seq ; INSTANCE: concatenation node
+ TUPLE: alternation seq ; INSTANCE: alternation node
+ TUPLE: kleene-star term ; INSTANCE: kleene-star node
- -TUPLE: independent-group term ; INSTANCE: independent-group node
++
++! !!!!!!!!
++TUPLE: possessive-question term ; INSTANCE: possessive-question node
++TUPLE: possessive-kleene-star term ; INSTANCE: possessive-kleene-star node
++
++! !!!!!!!!
++TUPLE: reluctant-question term ; INSTANCE: reluctant-question node
++TUPLE: reluctant-kleene-star term ; INSTANCE: reluctant-kleene-star node
++
+ TUPLE: negation term ; INSTANCE: negation node
+ TUPLE: constant char ; INSTANCE: constant node
+ TUPLE: range from to ; INSTANCE: range node
+ TUPLE: lookahead term ; INSTANCE: lookahead node
+ TUPLE: lookbehind term ; INSTANCE: lookbehind node
+ TUPLE: capture-group term ; INSTANCE: capture-group node
+ TUPLE: non-capture-group term ; INSTANCE: non-capture-group node
- -DEFER: nested-parse-regexp
++TUPLE: independent-group term ; INSTANCE: independent-group node ! atomic group
+ TUPLE: character-class-range from to ; INSTANCE: character-class-range node
+ SINGLETON: epsilon INSTANCE: epsilon node
+ SINGLETON: any-char INSTANCE: any-char node
+ SINGLETON: front-anchor INSTANCE: front-anchor node
+ SINGLETON: back-anchor INSTANCE: back-anchor node
+ 
+ TUPLE: option-on option ; INSTANCE: option-on node
+ TUPLE: option-off option ; INSTANCE: option-off node
+ SINGLETONS: unix-lines dotall multiline comments case-insensitive unicode-case reversed-regexp ;
+ 
+ SINGLETONS: letter-class LETTER-class Letter-class digit-class
+ alpha-class non-newline-blank-class
+ ascii-class punctuation-class java-printable-class blank-class
+ control-character-class hex-digit-class java-blank-class c-identifier-class
+ unmatchable-class ;
+ 
+ SINGLETONS: beginning-of-group end-of-group
+ beginning-of-character-class end-of-character-class
+ left-parenthesis pipe caret dash ;
+ 
+ : get-option ( option -- ? ) current-regexp get options>> at ;
+ : get-unix-lines ( -- ? ) unix-lines get-option ;
+ : get-dotall ( -- ? ) dotall get-option ;
+ : get-multiline ( -- ? ) multiline get-option ;
+ : get-comments ( -- ? ) comments get-option ;
+ : get-case-insensitive ( -- ? ) case-insensitive get-option ;
+ : get-unicode-case ( -- ? ) unicode-case get-option ;
+ : get-reversed-regexp ( -- ? ) reversed-regexp get-option ;
+ 
++: <possessive-kleene-star> ( obj -- kleene ) possessive-kleene-star boa ;
++: <reluctant-kleene-star> ( obj -- kleene ) reluctant-kleene-star boa ;
++: <possessive-question> ( obj -- kleene ) possessive-question boa ;
++: <reluctant-question> ( obj -- kleene ) reluctant-question boa ;
++
+ : <negation> ( obj -- negation ) negation boa ;
+ : <concatenation> ( seq -- concatenation )
+     >vector get-reversed-regexp [ reverse ] when
+     concatenation boa ;
+ : <alternation> ( seq -- alternation ) >vector alternation boa ;
+ : <capture-group> ( obj -- capture-group ) capture-group boa ;
+ : <kleene-star> ( obj -- kleene-star ) kleene-star boa ;
+ : <constant> ( obj -- constant )
+     dup Letter? get-case-insensitive and [
+         [ ch>lower constant boa ]
+         [ ch>upper constant boa ] bi 2array <alternation>
+     ] [
+         constant boa
+     ] if ;
+ 
+ : first|concatenation ( seq -- first/concatenation )
+     dup length 1 = [ first ] [ <concatenation> ] if ;
+ 
+ : first|alternation ( seq -- first/alternation )
+     dup length 1 = [ first ] [ <alternation> ] if ;
+ 
+ : <character-class-range> ( from to -- obj )
+     2dup [ Letter? ] bi@ or get-case-insensitive and [
+         [ [ ch>lower ] bi@ character-class-range boa ]
+         [ [ ch>upper ] bi@ character-class-range boa ] 2bi
+         2array [ [ from>> ] [ to>> ] bi < ] filter
+         [ unmatchable-class ] [ first|alternation ] if-empty
+     ] [
+         2dup <
+         [ character-class-range boa ] [ 2drop unmatchable-class ] if
+     ] if ;
+ 
+ ERROR: unmatched-parentheses ;
+ 
+ : make-positive-lookahead ( string -- )
+     lookahead boa push-stack ;
+ 
+ : make-negative-lookahead ( string -- )
+     <negation> lookahead boa push-stack ;
+ 
+ : make-independent-group ( string -- )
+     #! no backtracking
+     independent-group boa push-stack ;
+ 
+ : make-positive-lookbehind ( string -- )
+     lookbehind boa push-stack ;
+ 
+ : make-negative-lookbehind ( string -- )
+     <negation> lookbehind boa push-stack ;
+ 
- -            [ drop read1 drop nested-parse-regexp pop-stack make-positive-lookbehind ] }
+ : make-non-capturing-group ( string -- )
+     non-capture-group boa push-stack ;
+ 
+ ERROR: bad-option ch ;
+ 
+ : option ( ch -- singleton )
+     {
+         { CHAR: i [ case-insensitive ] }
+         { CHAR: d [ unix-lines ] }
+         { CHAR: m [ multiline ] }
++        { CHAR: n [ multiline ] }
+         { CHAR: r [ reversed-regexp ] }
+         { CHAR: s [ dotall ] }
+         { CHAR: u [ unicode-case ] }
+         { CHAR: x [ comments ] }
+         [ bad-option ]
+     } case ;
+ 
+ : option-on ( option -- ) current-regexp get options>> conjoin ;
+ : option-off ( option -- ) current-regexp get options>> delete-at ;
+ 
+ : toggle-option ( ch ? -- ) [ option ] dip [ option-on ] [ option-off ] if ;
+ : (parse-options) ( string ? -- ) [ toggle-option ] curry each ;
+ 
+ : parse-options ( string -- )
+     "-" split1 [ t (parse-options) ] [ f (parse-options) ] bi* ;
+ 
+ DEFER: (parse-regexp)
+ : parse-special-group ( -- )
+     beginning-of-group push-stack
+     (parse-regexp) pop-stack make-non-capturing-group ;
+ 
+ ERROR: bad-special-group string ;
+ 
++DEFER: nested-parse-regexp
+ : (parse-special-group) ( -- )
+     read1 {
+         { [ dup CHAR: : = ]
+             [ drop nested-parse-regexp pop-stack make-non-capturing-group ] }
+         { [ dup CHAR: = = ]
+             [ drop nested-parse-regexp pop-stack make-positive-lookahead ] }
+         { [ dup CHAR: = = ]
+             [ drop nested-parse-regexp pop-stack make-negative-lookahead ] }
+         { [ dup CHAR: > = ]
+             [ drop nested-parse-regexp pop-stack make-independent-group ] }
+         { [ dup CHAR: < = peek1 CHAR: = = and ]
- -            [ drop read1 drop nested-parse-regexp pop-stack make-negative-lookbehind ] }
++            [ drop drop1 nested-parse-regexp pop-stack make-positive-lookbehind ] }
+         { [ dup CHAR: < = peek1 CHAR: ! = and ]
- -    [ read1 drop (parse-special-group) ]
++            [ drop drop1 nested-parse-regexp pop-stack make-negative-lookbehind ] }
+         [
+             ":)" read-until
+             [ swap prefix ] dip
+             {
+                 { CHAR: : [ parse-options parse-special-group ] }
+                 { CHAR: ) [ parse-options ] }
+                 [ drop bad-special-group ]
+             } case
+         ]
+     } cond ;
+ 
+ : handle-left-parenthesis ( -- )
+     peek1 CHAR: ? =
- -: handle-star ( -- ) stack pop <kleene-star> push-stack ;
++    [ drop1 (parse-special-group) ]
+     [ nested-parse-regexp ] if ;
+ 
+ : handle-dot ( -- ) any-char push-stack ;
+ : handle-pipe ( -- ) pipe push-stack ;
- -    stack pop epsilon 2array <alternation> push-stack ;
++: (handle-star) ( obj -- kleene-star )
++    peek1 {
++        { CHAR: + [ drop1 <possessive-kleene-star> ] }
++        { CHAR: ? [ drop1 <reluctant-kleene-star> ] }
++        [ drop <kleene-star> ]
++    } case ;
++: handle-star ( -- ) stack pop (handle-star) push-stack ;
+ : handle-question ( -- )
- -    stack pop dup <kleene-star> 2array <concatenation> push-stack ;
++    stack pop peek1 {
++        { CHAR: + [ drop1 <possessive-question> ] }
++        { CHAR: ? [ drop1 <reluctant-question> ] }
++        [ drop epsilon 2array <alternation> ]
++    } case push-stack ;
+ : handle-plus ( -- )
- -    read1 drop
++    stack pop dup (handle-star)
++    2array <concatenation> push-stack ;
+ 
+ ERROR: unmatched-brace ;
+ : parse-repetition ( -- start finish ? )
+     "}" read-until [ unmatched-brace ] unless
+     [ "," split1 [ string>number ] bi@ ]
+     [ CHAR: , swap index >boolean ] bi ;
+ 
+ : replicate/concatenate ( n obj -- obj' )
+     over zero? [ 2drop epsilon ]
+     [ <repetition> first|concatenation ] if ;
+ 
+ : exactly-n ( n -- )
+     stack pop replicate/concatenate push-stack ;
+ 
+ : at-least-n ( n -- )
+     stack pop
+     [ replicate/concatenate ] keep
+     <kleene-star> 2array <concatenation> push-stack ;
+ 
+ : at-most-n ( n -- )
+     1+
+     stack pop
+     [ replicate/concatenate ] curry map <alternation> push-stack ;
+ 
+ : from-m-to-n ( m n -- )
+     [a,b]
+     stack pop
+     [ replicate/concatenate ] curry map
+     <alternation> push-stack ;
+ 
+ ERROR: invalid-range a b ;
+ 
+ : handle-left-brace ( -- )
+     parse-repetition
+     >r 2dup [ [ 0 < [ invalid-range ] when ] when* ] bi@ r>
+     [
+         2dup and [ from-m-to-n ]
+         [ [ nip at-most-n ] [ at-least-n ] if* ] if
+     ] [ drop 0 max exactly-n ] if ;
+ 
+ : handle-front-anchor ( -- ) front-anchor push-stack ;
+ : handle-back-anchor ( -- ) back-anchor push-stack ;
+ 
+ ERROR: bad-character-class obj ;
+ ERROR: expected-posix-class ;
+ 
+ : parse-posix-class ( -- obj )
+     read1 CHAR: { = [ expected-posix-class ] unless
+     "}" read-until [ bad-character-class ] unless
+     {
+         { "Lower" [ get-case-insensitive Letter-class letter-class ? ] }
+         { "Upper" [ get-case-insensitive Letter-class LETTER-class ? ] }
+         { "Alpha" [ Letter-class ] }
+         { "ASCII" [ ascii-class ] }
+         { "Digit" [ digit-class ] }
+         { "Alnum" [ alpha-class ] }
+         { "Punct" [ punctuation-class ] }
+         { "Graph" [ java-printable-class ] }
+         { "Print" [ java-printable-class ] }
+         { "Blank" [ non-newline-blank-class ] }
+         { "Cntrl" [ control-character-class ] }
+         { "XDigit" [ hex-digit-class ] }
+         { "Space" [ java-blank-class ] }
+         ! TODO: unicode-character-class, fallthrough in unicode is bad-char-clss
+         [ bad-character-class ]
+     } case ;
+ 
+ : parse-octal ( -- n ) 3 read oct> check-octal ;
+ : parse-short-hex ( -- n ) 2 read hex> check-hex ;
+ : parse-long-hex ( -- n ) 6 read hex> check-hex ;
+ : parse-control-character ( -- n ) read1 ;
+ 
+ ERROR: bad-escaped-literals seq ;
+ : parse-escaped-literals ( -- obj )
+     "\\E" read-until [ bad-escaped-literals ] unless
++    drop1
+     [ epsilon ] [
+         [ <constant> ] V{ } map-as
+         first|concatenation
+     ] if-empty ;
+ 
+ : parse-escaped ( -- obj )
+     read1
+     {
+         { CHAR: \ [ CHAR: \ <constant> ] }
+         { CHAR: . [ CHAR: . <constant> ] }
+         { CHAR: t [ CHAR: \t <constant> ] }
+         { CHAR: n [ CHAR: \n <constant> ] }
+         { CHAR: r [ CHAR: \r <constant> ] }
+         { CHAR: f [ HEX: c <constant> ] }
+         { CHAR: a [ HEX: 7 <constant> ] }
+         { CHAR: e [ HEX: 1b <constant> ] }
+ 
+         { CHAR: d [ digit-class ] }
+         { CHAR: D [ digit-class <negation> ] }
+         { CHAR: s [ java-blank-class ] }
+         { CHAR: S [ java-blank-class <negation> ] }
+         { CHAR: w [ c-identifier-class ] }
+         { CHAR: W [ c-identifier-class <negation> ] }
+ 
+         { CHAR: p [ parse-posix-class ] }
+         { CHAR: P [ parse-posix-class <negation> ] }
+         { CHAR: x [ parse-short-hex <constant> ] }
+         { CHAR: u [ parse-long-hex <constant> ] }
+         { CHAR: 0 [ parse-octal <constant> ] }
+         { CHAR: c [ parse-control-character ] }
+ 
+         ! { CHAR: b [ handle-word-boundary ] }
+         ! { CHAR: B [ handle-word-boundary <negation> ] }
+         ! { CHAR: A [ handle-beginning-of-input ] }
+         ! { CHAR: G [ end of previous match ] }
+         ! { CHAR: Z [ handle-end-of-input ] }
+         ! { CHAR: z [ handle-end-of-input ] } ! except for terminator
+ 
+         { CHAR: Q [ parse-escaped-literals ] }
+     } case ;
+ 
+ : handle-escape ( -- ) parse-escaped push-stack ;
+ 
+ : handle-dash ( vector -- vector' )
+     H{ { dash CHAR: - } } substitute ;
+ 
+ : character-class>alternation ( seq -- alternation )
+     [ dup number? [ <constant> ] when ] map first|alternation ;
+ 
+ : handle-caret ( vector -- vector' )
+     dup [ length 2 >= ] [ first caret eq? ] bi and [
+         rest-slice character-class>alternation <negation>
+     ] [
+         character-class>alternation
+     ] if ;
+ 
+ : make-character-class ( -- character-class )
+     [ beginning-of-character-class swap cut-stack ] change-whole-stack
+     handle-dash handle-caret ;
+ 
+ : apply-dash ( -- )
+     stack [ pop3 nip <character-class-range> ] keep push ;
+ 
+ : apply-dash? ( -- ? )
+     stack dup length 3 >=
+     [ [ length 2 - ] keep nth dash eq? ] [ drop f ] if ;
+ 
+ ERROR: empty-negated-character-class ;
+ DEFER: handle-left-bracket
+ : (parse-character-class) ( -- )
+     read1 [ empty-negated-character-class ] unless* {
+         { CHAR: [ [ handle-left-bracket t ] }
+         { CHAR: ] [ make-character-class push-stack f ] }
+         { CHAR: - [ dash push-stack t ] }
+         { CHAR: \ [ parse-escaped push-stack t ] }
+         [ push-stack apply-dash? [ apply-dash ] when t ]
+     } case
+     [ (parse-character-class) ] when ;
+ 
+ : parse-character-class-second ( -- )
+     read1 {
+         { CHAR: [ [ CHAR: [ <constant> push-stack ] }
+         { CHAR: ] [ CHAR: ] <constant> push-stack ] }
+         { CHAR: - [ CHAR: - <constant> push-stack ] }
+         [ push1 ]
+     } case ;
+ 
+ : parse-character-class-first ( -- )
+     read1 {
+         { CHAR: ^ [ caret push-stack parse-character-class-second ] }
+         { CHAR: [ [ CHAR: [ <constant> push-stack ] }
+         { CHAR: ] [ CHAR: ] <constant> push-stack ] }
+         { CHAR: - [ CHAR: - <constant> push-stack ] }
+         [ push1 ]
+     } case ;
+ 
+ : handle-left-bracket ( -- )
+     beginning-of-character-class push-stack
+     parse-character-class-first (parse-character-class) ;
+ 
+ : finish-regexp-parse ( stack -- obj )
+     { pipe } split
+     [ first|concatenation ] map first|alternation ;
+ 
+ : handle-right-parenthesis ( -- )
+     stack beginning-of-group over last-index cut rest
+     [ current-regexp get swap >>stack drop ]
+     [ finish-regexp-parse <capture-group> push-stack ] bi* ;
+ 
+ : nested-parse-regexp ( -- )
+     beginning-of-group push-stack (parse-regexp) ;
+ 
+ : ((parse-regexp)) ( token -- )
+     {
+         { CHAR: . [ handle-dot ] }
+         { CHAR: ( [ handle-left-parenthesis ] }
+         { CHAR: ) [ handle-right-parenthesis ] }
+         { CHAR: | [ handle-pipe ] }
+         { CHAR: ? [ handle-question ] }
+         { CHAR: * [ handle-star ] }
+         { CHAR: + [ handle-plus ] }
+         { CHAR: { [ handle-left-brace ] }
+         { CHAR: [ [ handle-left-bracket ] }
+         { CHAR: ^ [ handle-front-anchor ] }
+         { CHAR: $ [ handle-back-anchor ] }
+         { CHAR: \ [ handle-escape ] }
+         [ <constant> push-stack ]
+     } case ;
+ 
+ : (parse-regexp) ( -- )
+     read1 [ ((parse-regexp)) (parse-regexp) ] when* ;
+ 
+ : parse-regexp ( regexp -- )
+     dup current-regexp [
+         raw>> [
+             <string-reader> [ (parse-regexp) ] with-input-stream
+         ] unless-empty
+         current-regexp get
+         stack finish-regexp-parse
+             >>parse-tree drop
+     ] with-variable ;
diff --cc unfinished/regexp2/regexp2-tests.factor

index 0000000000000000000000000000000000000000,54626ea165054b90d6c80d721bf240928b6baea1..2bb194f01202b42301af0b5844835b3b34a0e394

mode 000000,100644..100644
--- /dev/null
--- 2/unfinished/regexp2/regexp2-tests.factor
+++ b/unfinished/regexp2/regexp2-tests.factor
@@@ -1,0 -1,263 +1,267 @@@
- -[ f ] [ "aaaab" "a++ab" <regexp> matches? ] unit-test
- -[ t ] [ "aaacb" "a++cb" <regexp> matches? ] unit-test
+ USING: regexp2 tools.test kernel sequences regexp2.parser
+ regexp2.traversal ;
+ IN: regexp2-tests
+ 
+ [ f ] [ "b" "a*" <regexp> matches? ] unit-test
+ [ t ] [ "" "a*" <regexp> matches? ] unit-test
+ [ t ] [ "a" "a*" <regexp> matches? ] unit-test
+ [ t ] [ "aaaaaaa" "a*"  <regexp> matches? ] unit-test
+ [ f ] [ "ab" "a*" <regexp> matches? ] unit-test
+ 
+ [ t ] [ "abc" "abc" <regexp> matches? ] unit-test
+ [ t ] [ "a" "a|b|c" <regexp> matches? ] unit-test
+ [ t ] [ "b" "a|b|c" <regexp> matches? ] unit-test
+ [ t ] [ "c" "a|b|c" <regexp> matches? ] unit-test
+ [ f ] [ "c" "d|e|f" <regexp> matches? ] unit-test
+ 
+ [ f ] [ "aa" "a|b|c" <regexp> matches? ] unit-test
+ [ f ] [ "bb" "a|b|c" <regexp> matches? ] unit-test
+ [ f ] [ "cc" "a|b|c" <regexp> matches? ] unit-test
+ [ f ] [ "cc" "d|e|f" <regexp> matches? ] unit-test
+ 
+ [ f ] [ "" "a+" <regexp> matches? ] unit-test
+ [ t ] [ "a" "a+" <regexp> matches? ] unit-test
+ [ t ] [ "aa" "a+" <regexp> matches? ] unit-test
+ 
+ [ t ] [ "" "a?" <regexp> matches? ] unit-test
+ [ t ] [ "a" "a?" <regexp> matches? ] unit-test
+ [ f ] [ "aa" "a?" <regexp> matches? ] unit-test
+ 
+ [ f ] [ "" "." <regexp> matches? ] unit-test
+ [ t ] [ "a" "." <regexp> matches? ] unit-test
+ [ t ] [ "." "." <regexp> matches? ] unit-test
+ ! [ f ] [ "\n" "." <regexp> matches? ] unit-test
+ 
+ [ f ] [ "" ".+" <regexp> matches? ] unit-test
+ [ t ] [ "a" ".+" <regexp> matches? ] unit-test
+ [ t ] [ "ab" ".+" <regexp> matches? ] unit-test
+ 
+ 
+ [ t ] [ "" "a|b*|c+|d?" <regexp> matches? ] unit-test
+ [ t ] [ "a" "a|b*|c+|d?" <regexp> matches? ] unit-test
+ [ t ] [ "c" "a|b*|c+|d?" <regexp> matches? ] unit-test
+ [ t ] [ "cc" "a|b*|c+|d?" <regexp> matches? ] unit-test
+ [ f ] [ "ccd" "a|b*|c+|d?" <regexp> matches? ] unit-test
+ [ t ] [ "d" "a|b*|c+|d?" <regexp> matches? ] unit-test
+ 
+ [ t ] [ "foo" "foo|bar" <regexp> matches? ] unit-test
+ [ t ] [ "bar" "foo|bar" <regexp> matches? ] unit-test
+ [ f ] [ "foobar" "foo|bar" <regexp> matches? ] unit-test
+ 
+ [ f ] [ "" "(a)" <regexp> matches? ] unit-test
+ [ t ] [ "a" "(a)" <regexp> matches? ] unit-test
+ [ f ] [ "aa" "(a)" <regexp> matches? ] unit-test
+ [ t ] [ "aa" "(a*)" <regexp> matches? ] unit-test
+ 
+ [ f ] [ "aababaaabbac" "(a|b)+" <regexp> matches? ] unit-test
+ [ t ] [ "ababaaabba" "(a|b)+" <regexp> matches? ] unit-test
+ 
+ [ f ] [ "" "a{1}" <regexp> matches? ] unit-test
+ [ t ] [ "a" "a{1}" <regexp> matches? ] unit-test
+ [ f ] [ "aa" "a{1}" <regexp> matches? ] unit-test
+ 
+ [ f ] [ "a" "a{2,}" <regexp> matches? ] unit-test
+ [ t ] [ "aaa" "a{2,}" <regexp> matches? ] unit-test
+ [ t ] [ "aaaa" "a{2,}" <regexp> matches? ] unit-test
+ [ t ] [ "aaaaa" "a{2,}" <regexp> matches? ] unit-test
+ 
+ [ t ] [ "" "a{,2}" <regexp> matches? ] unit-test
+ [ t ] [ "a" "a{,2}" <regexp> matches? ] unit-test
+ [ t ] [ "aa" "a{,2}" <regexp> matches? ] unit-test
+ [ f ] [ "aaa" "a{,2}" <regexp> matches? ] unit-test
+ [ f ] [ "aaaa" "a{,2}" <regexp> matches? ] unit-test
+ [ f ] [ "aaaaa" "a{,2}" <regexp> matches? ] unit-test
+ 
+ [ f ] [ "" "a{1,3}" <regexp> matches? ] unit-test
+ [ t ] [ "a" "a{1,3}" <regexp> matches? ] unit-test
+ [ t ] [ "aa" "a{1,3}" <regexp> matches? ] unit-test
+ [ t ] [ "aaa" "a{1,3}" <regexp> matches? ] unit-test
+ [ f ] [ "aaaa" "a{1,3}" <regexp> matches? ] unit-test
+ 
+ [ f ] [ "" "[a]" <regexp> matches? ] unit-test
+ [ t ] [ "a" "[a]" <regexp> matches? ] unit-test
+ [ t ] [ "a" "[abc]" <regexp> matches? ] unit-test
+ [ f ] [ "b" "[a]" <regexp> matches? ] unit-test
+ [ f ] [ "d" "[abc]" <regexp> matches? ] unit-test
+ [ t ] [ "ab" "[abc]{1,2}" <regexp> matches? ] unit-test
+ [ f ] [ "abc" "[abc]{1,2}" <regexp> matches? ] unit-test
+ 
+ [ f ] [ "" "[^a]" <regexp> matches? ] unit-test
+ [ f ] [ "a" "[^a]" <regexp> matches? ] unit-test
+ [ f ] [ "a" "[^abc]" <regexp> matches? ] unit-test
+ [ t ] [ "b" "[^a]" <regexp> matches? ] unit-test
+ [ t ] [ "d" "[^abc]" <regexp> matches? ] unit-test
+ [ f ] [ "ab" "[^abc]{1,2}" <regexp> matches? ] unit-test
+ [ f ] [ "abc" "[^abc]{1,2}" <regexp> matches? ] unit-test
+ 
+ [ t ] [ "]" "[]]" <regexp> matches? ] unit-test
+ [ f ] [ "]" "[^]]" <regexp> matches? ] unit-test
+ [ t ] [ "a" "[^]]" <regexp> matches? ] unit-test
+ 
+ [ "^" "[^]" <regexp> matches? ] must-fail
+ [ t ] [ "^" "[]^]" <regexp> matches? ] unit-test
+ [ t ] [ "]" "[]^]" <regexp> matches? ] unit-test
+ 
+ [ t ] [ "[" "[[]" <regexp> matches? ] unit-test
+ [ f ] [ "^" "[^^]" <regexp> matches? ] unit-test
+ [ t ] [ "a" "[^^]" <regexp> matches? ] unit-test
+ 
+ [ t ] [ "-" "[-]" <regexp> matches? ] unit-test
+ [ f ] [ "a" "[-]" <regexp> matches? ] unit-test
+ [ f ] [ "-" "[^-]" <regexp> matches? ] unit-test
+ [ t ] [ "a" "[^-]" <regexp> matches? ] unit-test
+ 
+ [ t ] [ "-" "[-a]" <regexp> matches? ] unit-test
+ [ t ] [ "a" "[-a]" <regexp> matches? ] unit-test
+ [ t ] [ "-" "[a-]" <regexp> matches? ] unit-test
+ [ t ] [ "a" "[a-]" <regexp> matches? ] unit-test
+ [ f ] [ "b" "[a-]" <regexp> matches? ] unit-test
+ [ f ] [ "-" "[^-]" <regexp> matches? ] unit-test
+ [ t ] [ "a" "[^-]" <regexp> matches? ] unit-test
+ 
+ [ f ] [ "-" "[a-c]" <regexp> matches? ] unit-test
+ [ t ] [ "-" "[^a-c]" <regexp> matches? ] unit-test
+ [ t ] [ "b" "[a-c]" <regexp> matches? ] unit-test
+ [ f ] [ "b" "[^a-c]" <regexp> matches? ] unit-test
+ 
+ [ t ] [ "-" "[a-c-]" <regexp> matches? ] unit-test
+ [ f ] [ "-" "[^a-c-]" <regexp> matches? ] unit-test
+ 
+ [ t ] [ "\\" "[\\\\]" <regexp> matches? ] unit-test
+ [ f ] [ "a" "[\\\\]" <regexp> matches? ] unit-test
+ [ f ] [ "\\" "[^\\\\]" <regexp> matches? ] unit-test
+ [ t ] [ "a" "[^\\\\]" <regexp> matches? ] unit-test
+ 
+ [ t ] [ "0" "[\\d]" <regexp> matches? ] unit-test
+ [ f ] [ "a" "[\\d]" <regexp> matches? ] unit-test
+ [ f ] [ "0" "[^\\d]" <regexp> matches? ] unit-test
+ [ t ] [ "a" "[^\\d]" <regexp> matches? ] unit-test
+ 
+ [ t ] [ "a" "[a-z]{1,}|[A-Z]{2,4}|b*|c|(f|g)*" <regexp> matches? ] unit-test
+ [ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}|b*|c|(f|g)*" <regexp> matches? ] unit-test
+ [ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}" <regexp> matches? ] unit-test
+ 
+ [ t ] [ "1000" "\\d{4,6}" <regexp> matches? ] unit-test
+ [ t ] [ "1000" "[0-9]{4,6}" <regexp> matches? ] unit-test
+ 
+ [ t ] [ "abc" "\\p{Lower}{3}" <regexp> matches? ] unit-test
+ [ f ] [ "ABC" "\\p{Lower}{3}" <regexp> matches? ] unit-test
+ [ t ] [ "ABC" "\\p{Upper}{3}" <regexp> matches? ] unit-test
+ [ f ] [ "abc" "\\p{Upper}{3}" <regexp> matches? ] unit-test
+ ! 
+ [ f ] [ "abc" "[\\p{Upper}]{3}" <regexp> matches? ] unit-test
+ [ t ] [ "ABC" "[\\p{Upper}]{3}" <regexp> matches? ] unit-test
+ 
+ [ f ] [ "" "\\Q\\E" <regexp> matches? ] unit-test
+ [ f ] [ "a" "\\Q\\E" <regexp> matches? ] unit-test
+ [ t ] [ "|*+" "\\Q|*+\\E" <regexp> matches? ] unit-test
+ [ f ] [ "abc" "\\Q|*+\\E" <regexp> matches? ] unit-test
+ [ t ] [ "s" "\\Qs\\E" <regexp> matches? ] unit-test
+ 
+ [ t ] [ "S" "\\0123" <regexp> matches? ] unit-test
+ [ t ] [ "SXY" "\\0123XY" <regexp> matches? ] unit-test
+ [ t ] [ "x" "\\x78" <regexp> matches? ] unit-test
+ [ f ] [ "y" "\\x78" <regexp> matches? ] unit-test
+ [ t ] [ "x" "\\u000078" <regexp> matches? ] unit-test
+ [ f ] [ "y" "\\u000078" <regexp> matches? ] unit-test
+ 
+ [ t ] [ "ab" "a+b" <regexp> matches? ] unit-test
+ [ f ] [ "b" "a+b" <regexp> matches? ] unit-test
+ [ t ] [ "aab" "a+b" <regexp> matches? ] unit-test
+ [ f ] [ "abb" "a+b" <regexp> matches? ] unit-test
+ 
+ [ t ] [ "abbbb" "ab*" <regexp> matches? ] unit-test
+ [ t ] [ "a" "ab*" <regexp> matches? ] unit-test
+ [ f ] [ "abab" "ab*" <regexp> matches? ] unit-test
+ 
+ [ f ] [ "x" "\\." <regexp> matches? ] unit-test
+ [ t ] [ "." "\\." <regexp> matches? ] unit-test
+ 
+ [ t ] [ "aaaab" "a+ab" <regexp> matches? ] unit-test
+ [ f ] [ "aaaxb" "a+ab" <regexp> matches? ] unit-test
+ [ t ] [ "aaacb" "a+cb" <regexp> matches? ] unit-test
- -[ 1 ] [ "aaacb" "a+?" <regexp> match-head ] unit-test
+ 
+ [ 3 ] [ "aaacb" "a*" <regexp> match-head ] unit-test
- -[ 1 ] [ "aaacb" "aa??" <regexp> match-head ] unit-test
- -[ 3 ] [ "aacb" "aa?c" <regexp> match-head ] unit-test
- -[ 3 ] [ "aacb" "aa??c" <regexp> match-head ] unit-test
+ [ 2 ] [ "aaacb" "aa?" <regexp> match-head ] unit-test
- -! [ t ] [ "aaa" "AAA" t <regexp> matches? ] unit-test
- -! [ f ] [ "aax" "AAA" t <regexp> matches? ] unit-test
- -! [ t ] [ "aaa" "A*" t <regexp> matches? ] unit-test
- -! [ f ] [ "aaba" "A*" t <regexp> matches? ] unit-test
- -! [ t ] [ "b" "[AB]" t <regexp> matches? ] unit-test
- -! [ f ] [ "c" "[AB]" t <regexp> matches? ] unit-test
- -! [ t ] [ "c" "[A-Z]" t <regexp> matches? ] unit-test
- -! [ f ] [ "3" "[A-Z]" t <regexp> matches? ] unit-test
+ 
- -! [ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-head ] unit-test
- -! [ f ] [ "foobxr" "foo(?=bar)" <regexp> match-head ] unit-test
++[ t ] [ "aaa" "AAA" <iregexp> matches? ] unit-test
++[ f ] [ "aax" "AAA" <iregexp> matches? ] unit-test
++[ t ] [ "aaa" "A*" <iregexp> matches? ] unit-test
++[ f ] [ "aaba" "A*" <iregexp> matches? ] unit-test
++[ t ] [ "b" "[AB]" <iregexp> matches? ] unit-test
++[ f ] [ "c" "[AB]" <iregexp> matches? ] unit-test
++[ t ] [ "c" "[A-Z]" <iregexp> matches? ] unit-test
++[ f ] [ "3" "[A-Z]" <iregexp> matches? ] unit-test
++
++[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
++[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
++[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
++[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
++
++[ t ] [ "a" "(?-i)a" <iregexp> matches? ] unit-test
++[ t ] [ "a" "(?-i)a" <iregexp> matches? ] unit-test
++[ f ] [ "A" "(?-i)a" <iregexp> matches? ] unit-test
++[ f ] [ "A" "(?-i)a" <iregexp> matches? ] unit-test
++
++[ f ] [ "A" "[a-z]" <regexp> matches? ] unit-test
++[ t ] [ "A" "[a-z]" <iregexp> matches? ] unit-test
++
++[ f ] [ "A" "\\p{Lower}" <regexp> matches? ] unit-test
++[ t ] [ "A" "\\p{Lower}" <iregexp> matches? ] unit-test
++
++[ t ] [ "abc" <reversed> "abc" <rregexp> matches? ] unit-test
++[ t ] [ "abc" <reversed> "a[bB][cC]" <rregexp> matches? ] unit-test
++[ t ] [ "adcbe" "a(?r)(bcd)(?-r)e" <rregexp> matches? ] unit-test
++
++[ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
++[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
++[ t ] [ ".o" "\\.[a-z]" <regexp> matches? ] unit-test
+ 
+ [ ] [
+     "(0[lL]?|[1-9]\\d{0,9}(\\d{0,9}[lL])?|0[xX]\\p{XDigit}{1,8}(\\p{XDigit}{0,8}[lL])?|0[0-7]{1,11}([0-7]{0,11}[lL])?|([0-9]+\\.[0-9]*|\\.[0-9]+)([eE][+-]?[0-9]+)?[fFdD]?|[0-9]+([eE][+-]?[0-9]+[fFdD]?|([eE][+-]?[0-9]+)?[fFdD]))"
+     <regexp> drop
+ ] unit-test
+ 
++
++
++
+ [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
+ 
++[ 1 ] [ "aaacb" "a+?" <regexp> match-head ] unit-test
++[ 1 ] [ "aaacb" "aa??" <regexp> match-head ] unit-test
++[ f ] [ "aaaab" "a++ab" <regexp> matches? ] unit-test
++[ t ] [ "aaacb" "a++cb" <regexp> matches? ] unit-test
++[ 3 ] [ "aacb" "aa?c" <regexp> match-head ] unit-test
++[ 3 ] [ "aacb" "aa??c" <regexp> match-head ] unit-test
++
+ [ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
+ [ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
+ 
- -[ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
- -[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
- -[ t ] [ ".o" "\\.[a-z]" <regexp> matches? ] unit-test
- -
- -[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
- -[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
- -[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
- -[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
- -
- -[ t ] [ "a" "(?-i)a" <iregexp> matches? ] unit-test
- -[ t ] [ "a" "(?-i)a" <iregexp> matches? ] unit-test
- -[ f ] [ "A" "(?-i)a" <iregexp> matches? ] unit-test
- -[ f ] [ "A" "(?-i)a" <iregexp> matches? ] unit-test
- -
- -[ f ] [ "A" "[a-z]" <regexp> matches? ] unit-test
- -[ t ] [ "A" "[a-z]" <iregexp> matches? ] unit-test
- -
- -[ f ] [ "A" "\\p{Lower}" <regexp> matches? ] unit-test
- -[ t ] [ "A" "\\p{Lower}" <iregexp> matches? ] unit-test
- -
- -[ t ] [ "abc" <reversed> "abc" <rregexp> matches? ] unit-test
- -[ t ] [ "abc" <reversed> "a[bB][cC]" <rregexp> matches? ] unit-test
- -[ t ] [ "adcbe" "a(?r)(bcd)(?-r)e" <rregexp> matches? ] unit-test
- -
++[ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-head ] unit-test
++[ f ] [ "foobxr" "foo(?=bar)" <regexp> match-head ] unit-test
+ 
+ ! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test
+ ! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test
+ 
+ ! [ 3 ] [ "foo bar" "foo\\b" <regexp> match-head ] unit-test
+ ! [ f ] [ "fooxbar" "foo\\b" <regexp> matches? ] unit-test
+ ! [ t ] [ "foo" "foo\\b" <regexp> matches? ] unit-test
+ ! [ t ] [ "foo bar" "foo\\b bar" <regexp> matches? ] unit-test
+ ! [ f ] [ "fooxbar" "foo\\bxbar" <regexp> matches? ] unit-test
+ ! [ f ] [ "foo" "foo\\bbar" <regexp> matches? ] unit-test
+ 
+ ! [ f ] [ "foo bar" "foo\\B" <regexp> matches? ] unit-test
+ ! [ 3 ] [ "fooxbar" "foo\\B" <regexp> match-head ] unit-test
+ ! [ t ] [ "foo" "foo\\B" <regexp> matches? ] unit-test
+ ! [ f ] [ "foo bar" "foo\\B bar" <regexp> matches? ] unit-test
+ ! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test
+ ! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test
+ 
+ ! Bug in parsing word
+ ! [ t ] [ "a" R' a' matches?  ] unit-test
+ 
+ ! ((A)(B(C)))
+ ! 1.  ((A)(B(C)))
+ ! 2. (A)
+ ! 3. (B(C))
+ ! 4. (C) 
diff --cc unfinished/regexp2/utils/utils.factor

index 0000000000000000000000000000000000000000,0167e730053d27f1b3043adcd390ba98fd2a1f5d..a7606e0af33acddfdc1fe21522648c0a4937f5e4

mode 000000,100644..100644
--- /dev/null
--- 2/unfinished/regexp2/utils/utils.factor
+++ b/unfinished/regexp2/utils/utils.factor
@@@ -1,0 -1,69 +1,70 @@@
+ ! Copyright (C) 2008 Doug Coleman.
+ ! See http://factorcode.org/license.txt for BSD license.
+ USING: accessors arrays assocs combinators.lib io kernel
+ math math.order namespaces regexp2.backend sequences
+ sequences.lib unicode.categories math.ranges fry
+ combinators.short-circuit ;
+ IN: regexp2.utils
+ 
+ : (while-changes) ( obj quot pred pred-ret -- obj )
+     ! quot: ( obj -- obj' )
+     ! pred: ( obj -- <=> )
+     >r >r dup slip r> pick over call r> dupd =
+     [ 3drop ] [ (while-changes) ] if ; inline
+ 
+ : while-changes ( obj quot pred -- obj' )
+     pick over call (while-changes) ; inline
+ 
+ : last-state ( regexp -- range ) stack>> peek first2 [a,b] ;
+ : push1 ( obj -- ) input-stream get stream>> push ;
+ : peek1 ( -- obj ) input-stream get stream>> [ f ] [ peek ] if-empty ;
+ : pop3 ( seq -- obj1 obj2 obj3 ) [ pop ] [ pop ] [ pop ] tri spin ;
++: drop1 ( -- ) read1 drop ;
+ 
+ : stack ( -- obj ) current-regexp get stack>> ;
+ : change-whole-stack ( quot -- )
+     current-regexp get
+     [ stack>> swap call ] keep (>>stack) ; inline
+ : push-stack ( obj -- ) stack push ;
+ : pop-stack ( -- obj ) stack pop ;
+ : cut-out ( vector n -- vector' vector ) cut rest ;
+ ERROR: cut-stack-error ;
+ : cut-stack ( obj vector -- vector' vector )
+     tuck last-index [ cut-stack-error ] unless* cut-out swap ;
+ 
+ ERROR: bad-octal number ;
+ ERROR: bad-hex number ;
+ : check-octal ( octal -- octal ) dup 255 > [ bad-octal ] when ;
+ : check-hex ( hex -- hex ) dup number? [ bad-hex ] unless ;
+ 
+ : ascii? ( n -- ? ) 0 HEX: 7f between? ;
+ : octal-digit? ( n -- ? ) CHAR: 0 CHAR: 7 between? ;
+ : decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ;
+ 
+ : hex-digit? ( n -- ? )
+     [
+         [ decimal-digit? ]
+         [ CHAR: a CHAR: f between? ]
+         [ CHAR: A CHAR: F between? ]
+     ] 1|| ;
+ 
+ : control-char? ( n -- ? )
+     [
+         [ 0 HEX: 1f between? ]
+         [ HEX: 7f = ]
+     ] 1|| ;
+ 
+ : punct? ( n -- ? )
+     "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
+ 
+ : c-identifier-char? ( ch -- ? )
+     [ [ alpha? ] [ CHAR: _ = ] ] 1|| ;
+ 
+ : java-blank? ( n -- ? )
+     {
+         CHAR: \s CHAR: \t CHAR: \n
+         HEX: b HEX: 7 CHAR: \r
+     } member? ;
+ 
+ : java-printable? ( n -- ? )
+     [ [ alpha? ] [ punct? ] ] 1|| ;
author	Doug Coleman <doug.coleman@gmail.com>
	Wed, 27 Aug 2008 15:22:28 +0000 (10:22 -0500)
committer	Doug Coleman <doug.coleman@gmail.com>
	Wed, 27 Aug 2008 15:22:28 +0000 (10:22 -0500)
		1	2
unfinished/regexp2/parser/parser.factor	patch \|	\|	diff2 \|	blob \| history
unfinished/regexp2/regexp2-tests.factor	patch \|	\|	diff2 \|	blob \| history
unfinished/regexp2/utils/utils.factor	patch \|	\|	diff2 \|	blob \| history