: char-class ( ranges ? -- term )
[ <or-class> ] dip [ <not-class> ] when ;
-TUPLE: lookahead term positive? ;
+TUPLE: lookahead term ;
C: <lookahead> lookahead
-TUPLE: lookbehind term positive? ;
+TUPLE: lookbehind term ;
C: <lookbehind> lookbehind
control-character-class hex-digit-class java-blank-class c-identifier-class
unmatchable-class terminator-class word-boundary-class ;
-SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file ;
+SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file word-break ;
TUPLE: range from to ;
C: <range> range
USING: regexp.classes kernel sequences regexp.negation
quotations assocs fry math locals combinators
accessors words compiler.units kernel.private strings
-sequences.private arrays call namespaces
+sequences.private arrays call namespaces unicode.breaks
regexp.transition-tables combinators.short-circuit ;
IN: regexp.compiler
<PRIVATE
M: t question>quot drop [ 2drop t ] ;
+M: f question>quot drop [ 2drop f ] ;
+
+M: not-class question>quot
+ class>> question>quot [ not ] compose ;
M: beginning-of-input question>quot
drop [ drop zero? ] ;
M: ^ question>quot
drop [ { [ drop zero? ] [ [ 1- ] dip ?nth "\r\n" member? ] } 2|| ] ;
+M: word-break question>quot
+ drop [ word-break-at? ] ;
+
: (execution-quot) ( next-state -- quot )
! The conditions here are for lookaround and anchors, etc
dup condition? [
{ CHAR: z [ end-of-input <tagged-epsilon> ] }
{ CHAR: Z [ end-of-file <tagged-epsilon> ] }
{ CHAR: A [ beginning-of-input <tagged-epsilon> ] }
+ { CHAR: b [ word-break <tagged-epsilon> ] }
+ { CHAR: B [ word-break <not-class> <tagged-epsilon> ] }
[ ]
} case ;
=> [[ a on off parse-options <with-options> ]]
| "?#" [^)]* => [[ f ]]
| "?~" Alternation:a => [[ a <negation> ]]
- | "?=" Alternation:a => [[ a t <lookahead> <tagged-epsilon> ]]
- | "?!" Alternation:a => [[ a f <lookahead> <tagged-epsilon> ]]
- | "?<=" Alternation:a => [[ a t <lookbehind> <tagged-epsilon> ]]
- | "?<!" Alternation:a => [[ a f <lookbehind> <tagged-epsilon> ]]
+ | "?=" Alternation:a => [[ a <lookahead> <tagged-epsilon> ]]
+ | "?!" Alternation:a => [[ a <lookahead> <not-class> <tagged-epsilon> ]]
+ | "?<=" Alternation:a => [[ a <lookbehind> <tagged-epsilon> ]]
+ | "?<!" Alternation:a => [[ a <lookbehind> <not-class> <tagged-epsilon> ]]
| Alternation
Element = "(" Parenthized:p ")" => [[ p ]]
[ { "foo" "fxx" "fab" } ] [ "fab fxx foo" R/ f../r all-matches [ >string ] map ] unit-test
-! [ t ] [ "foo" "\\bfoo\\b" <regexp> matches? ] unit-test
-! [ t ] [ "afoob" "\\Bfoo\\B" <regexp> matches? ] unit-test
-! [ t ] [ "afoob" "\\bfoo\\b" <regexp> matches? ] unit-test
-! [ f ] [ "foo" "\\Bfoo\\B" <regexp> matches? ] unit-test
-
-! [ 3 ] [ "foo bar" "foo\\b" <regexp> match-index-head ] unit-test
-! [ f ] [ "fooxbar" "foo\\b" <regexp> matches? ] unit-test
-! [ t ] [ "foo" "foo\\b" <regexp> matches? ] unit-test
-! [ t ] [ "foo bar" "foo\\b bar" <regexp> matches? ] unit-test
-! [ f ] [ "fooxbar" "foo\\bxbar" <regexp> matches? ] unit-test
-! [ f ] [ "foo" "foo\\bbar" <regexp> matches? ] unit-test
-
-! [ f ] [ "foo bar" "foo\\B" <regexp> matches? ] unit-test
-! [ 3 ] [ "fooxbar" "foo\\B" <regexp> match-index-head ] unit-test
-! [ t ] [ "foo" "foo\\B" <regexp> matches? ] unit-test
-! [ f ] [ "foo bar" "foo\\B bar" <regexp> matches? ] unit-test
-! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test
-! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test
+[ t ] [ "foo" "\\bfoo\\b" <regexp> re-contains? ] unit-test
+[ t ] [ "afoob" "\\Bfoo\\B" <regexp> re-contains? ] unit-test
+[ f ] [ "afoob" "\\bfoo\\b" <regexp> re-contains? ] unit-test
+[ f ] [ "foo" "\\Bfoo\\B" <regexp> re-contains? ] unit-test
+
+[ 3 ] [ "foo bar" "foo\\b" <regexp> first-match length ] unit-test
+[ f ] [ "fooxbar" "foo\\b" <regexp> re-contains? ] unit-test
+[ t ] [ "foo" "foo\\b" <regexp> re-contains? ] unit-test
+[ t ] [ "foo bar" "foo\\b bar" <regexp> matches? ] unit-test
+[ f ] [ "fooxbar" "foo\\bxbar" <regexp> matches? ] unit-test
+[ f ] [ "foo" "foo\\bbar" <regexp> matches? ] unit-test
+
+[ f ] [ "foo bar" "foo\\B" <regexp> re-contains? ] unit-test
+[ 3 ] [ "fooxbar" "foo\\B" <regexp> first-match length ] unit-test
+[ f ] [ "foo" "foo\\B" <regexp> re-contains? ] unit-test
+[ f ] [ "foo bar" "foo\\B bar" <regexp> matches? ] unit-test
+[ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test
+[ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test
! [ 1 ] [ "aaacb" "a+?" <regexp> match-index-head ] unit-test
! [ 1 ] [ "aaacb" "aa??" <regexp> match-index-head ] unit-test
<PRIVATE
-: maybe-negated ( lookaround quot -- regexp-quot )
- '[ term>> @ ] [ positive?>> [ ] [ not ] ? ] bi compose ; inline
-
M: lookahead question>quot ! Returns ( index string -- ? )
- [ ast>dfa dfa>shortest-word '[ f _ execute ] ] maybe-negated ;
+ term>> ast>dfa dfa>shortest-word '[ f _ execute ] ;
: <reversed-option> ( ast -- reversed )
"r" string>options <with-options> ;
M: lookbehind question>quot ! Returns ( index string -- ? )
- [
- <reversed-option>
- ast>dfa dfa>reverse-shortest-word
- '[ [ 1- ] dip f _ execute ]
- ] maybe-negated ;
+ term>> <reversed-option>
+ ast>dfa dfa>reverse-shortest-word
+ '[ [ 1- ] dip f _ execute ] ;
: check-string ( string -- string )
! Make this configurable
grapheme-break-test parse-test-file [ >graphemes ] test
word-break-test parse-test-file [ >words ] test
+
+[ { t f t t f t } ] [ 6 [ "as df" word-break-at? ] map ] unit-test
: >words ( str -- words )
[ first-word ] >pieces ;
+
+<PRIVATE
+
+: nth-next ( i str -- str[i-1] str[i] )
+ [ [ 1- ] keep ] dip '[ _ nth ] bi@ ;
+
+PRIVATE>
+
+: word-break-at? ( i str -- ? )
+ {
+ [ drop zero? ]
+ [ length = ]
+ [
+ [ nth-next [ word-break-prop ] dip ] 2keep
+ word-break-next nip
+ ]
+ } 2|| ;