TUPLE: at-least n ;
C: <at-least> at-least
-SINGLETON: epsilon
+TUPLE: tagged-epsilon tag ;
+C: <tagged-epsilon> tagged-epsilon
+
+CONSTANT: epsilon T{ tagged-epsilon }
TUPLE: concatenation first second ;
TUPLE: lookbehind term ;
C: <lookbehind> lookbehind
+
+TUPLE: possessive-star term ;
+C: <possessive-star> possessive-star
+
+: <possessive-plus> ( term -- term' )
+ dup <possessive-star> 2array <concatenation> ;
+
control-character-class hex-digit-class java-blank-class c-identifier-class
unmatchable-class terminator-class word-boundary-class ;
-SINGLETONS: beginning-of-input beginning-of-line
-end-of-input end-of-line ;
+SINGLETONS: beginning-of-input ^ end-of-input $ ;
TUPLE: range from to ;
C: <range> range
M: terminator-class class-member? ( obj class -- ? )
drop "\r\n\u000085\u002029\u002028" member? ;
-M: beginning-of-line class-member? ( obj class -- ? )
+M: ^ class-member? ( obj class -- ? )
2drop f ;
-M: end-of-line class-member? ( obj class -- ? )
+M: $ class-member? ( obj class -- ? )
2drop f ;
M: f class-member? 2drop f ;
--- /dev/null
+! Copyright (C) 2009 Daniel Ehrenberg
+! See http://factorcode.org/license.txt for BSD license.
+USING: regexp.combinators tools.test regexp kernel sequences ;
+IN: regexp.combinators.tests
+
+: strings ( -- regexp )
+ { "foo" "bar" "baz" } <any-of> ;
+
+[ t t t ] [ "foo" "bar" "baz" [ strings matches? ] tri@ ] unit-test
+[ f f f ] [ "food" "ibar" "ba" [ strings matches? ] tri@ ] unit-test
+
+: conj ( -- regexp )
+ { R/ .*a/ R/ b.*/ } <and> ;
+
+[ t ] [ "bljhasflsda" conj matches? ] unit-test
+[ f ] [ "bsdfdfs" conj matches? ] unit-test ! why does this fail?
+[ f ] [ "fsfa" conj matches? ] unit-test
+
+! For some reason, creating this DFA doesn't work
+! [ f ] [ "bljhasflsda" conj <not> matches? ] unit-test
+! [ t ] [ "bsdfdfs" conj <not> matches? ] unit-test
+! [ t ] [ "fsfa" conj <not> matches? ] unit-test
+
+[ f f ] [ "" "hi" [ <nothing> matches? ] bi@ ] unit-test
+[ t t ] [ "" "hi" [ <nothing> <not> matches? ] bi@ ] unit-test
+
+[ { t t t f } ] [ { "" "a" "aaaaa" "aab" } [ "a" <literal> <zero-or-more> matches? ] map ] unit-test
+[ { f t t f } ] [ { "" "a" "aaaaa" "aab" } [ "a" <literal> <one-or-more> matches? ] map ] unit-test
+[ { t t f f } ] [ { "" "a" "aaaaa" "aab" } [ "a" <literal> <option> matches? ] map ] unit-test
--- /dev/null
+! Copyright (C) 2009 Daniel Ehrenberg
+! See http://factorcode.org/license.txt for BSD license.
+USING: regexp sequences kernel regexp.negation regexp.ast
+accessors fry ;
+IN: regexp.combinators
+
+: <nothing> ( -- regexp )
+ R/ (?~.*)/ ;
+
+: <literal> ( string -- regexp )
+ [ "\\Q" "\\E" surround ] [ <concatenation> ] bi make-regexp ;
+
+: <or> ( regexps -- disjunction )
+ [ [ raw>> "(" ")" surround ] map "|" join ]
+ [ [ parse-tree>> ] map <alternation> ] bi
+ make-regexp ;
+
+: <any-of> ( strings -- regexp )
+ [ <literal> ] map <or> ;
+
+: <sequence> ( regexps -- regexp )
+ [ [ raw>> ] map concat ]
+ [ [ parse-tree>> ] map <concatenation> ] bi
+ make-regexp ;
+
+: modify-regexp ( regexp raw-quot tree-quot -- new-regexp )
+ [ '[ raw>> @ ] ]
+ [ '[ parse-tree>> @ ] ] bi* bi
+ make-regexp ; inline
+
+: <not> ( regexp -- not-regexp )
+ [ "(?~" ")" surround ]
+ [ <negation> ] modify-regexp ;
+
+: <and> ( regexps -- conjunction )
+ [ <not> ] map <or> <not> ;
+
+: <zero-or-more> ( regexp -- regexp* )
+ [ "(" ")*" surround ]
+ [ <star> ] modify-regexp ;
+
+: <one-or-more> ( regexp -- regexp+ )
+ [ "(" ")+" surround ]
+ [ <plus> ] modify-regexp ;
+
+: <option> ( regexp -- regexp? )
+ [ "(" ")?" surround ]
+ [ <maybe> ] modify-regexp ;
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs combinators fry kernel locals
math math.order regexp.nfa regexp.transition-tables sequences
-sets sorting vectors ;
+sets sorting vectors regexp.ast ;
IN: regexp.dfa
:: (while-changes) ( obj quot: ( obj -- obj' ) comp: ( obj -- key ) old-key -- obj )
transitions>> '[ _ swap _ at at ] gather sift ;
: (find-epsilon-closure) ( states nfa -- new-states )
- eps swap find-delta ;
+ epsilon swap find-delta ;
: find-epsilon-closure ( states nfa -- new-states )
'[ dup _ (find-epsilon-closure) union ] [ length ] while-changes
: find-transitions ( dfa-state nfa -- next-dfa-state )
transitions>>
'[ _ at keys ] gather
- eps swap remove ;
+ epsilon swap remove ;
: add-todo-state ( state visited-states new-states -- )
3dup drop key? [ 3drop ] [
! See http://factorcode.org/license.txt for BSD license.
USING: regexp.nfa regexp.disambiguate kernel sequences
assocs regexp.classes hashtables accessors fry vectors
-regexp.ast regexp.transition-tables regexp.minimize ;
+regexp.ast regexp.transition-tables regexp.minimize namespaces ;
IN: regexp.negation
: ast>dfa ( parse-tree -- minimal-dfa )
: unify-final-state ( transition-table -- transition-table )
dup [ final-states>> keys ] keep
- '[ -2 eps <literal-transition> _ add-transition ] each
+ '[ -2 epsilon <literal-transition> _ add-transition ] each
H{ { -2 -2 } } >>final-states ;
: adjoin-dfa ( transition-table -- start end )
box-transitions unify-final-state renumber-states
[ start-state>> ]
[ final-states>> keys first ]
- [ table [ transitions>> ] bi@ swap update ] tri ;
+ [ nfa-table get [ transitions>> ] bi@ swap update ] tri ;
M: negation nfa-node ( node -- start end )
term>> ast>dfa negate-table adjoin-dfa ;
-! Copyright (C) 2008 Doug Coleman.
+! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs grouping kernel
locals math namespaces sequences fry quotations
M: concatenation remove-lookahead ;
-SINGLETON: eps
-
SYMBOL: option-stack
SYMBOL: state
state [ get ] [ inc ] bi ;
SYMBOL: nfa-table
-: table ( -- table ) nfa-table get ;
: set-each ( keys value hashtable -- )
'[ _ swap _ set-at ] each ;
: add-simple-entry ( obj class -- start-state end-state )
[ next-state next-state 2dup ] 2dip
- make-transition table add-transition ;
+ make-transition nfa-table get add-transition ;
: epsilon-transition ( source target -- )
- eps <literal-transition> table add-transition ;
+ epsilon <literal-transition> nfa-table get add-transition ;
M:: star nfa-node ( node -- start end )
node term>> nfa-node :> s1 :> s0
s1 s3 epsilon-transition
s2 s3 ;
-M: epsilon nfa-node
- drop eps literal-transition add-simple-entry ;
+M: tagged-epsilon nfa-node
+ literal-transition add-simple-entry ;
M: concatenation nfa-node ( node -- start end )
[ first>> ] [ second>> ] bi
0 state set
<transition-table> nfa-table set
remove-lookahead nfa-node
- table
+ nfa-table get
swap dup associate >>final-states
swap >>start-state
] with-scope ;
IN: regexp.parser
: allowed-char? ( ch -- ? )
- ".()|[*+?" member? not ;
+ ".()|[*+?$^" member? not ;
ERROR: bad-number ;
{ CHAR: d [ digit-class <primitive-class> ] }
{ CHAR: D [ digit-class <primitive-class> <not-class> ] }
+ { CHAR: z [ end-of-input <tagged-epsilon> ] }
+ { CHAR: A [ beginning-of-input <tagged-epsilon> ] }
[ ]
} case ;
EscapeSequence = "\\" Escape:e => [[ e ]]
-Character = EscapeSequence | . ?[ allowed-char? ]?
+Character = EscapeSequence
+ | "$" => [[ $ <tagged-epsilon> ]]
+ | "^" => [[ ^ <tagged-epsilon> ]]
+ | . ?[ allowed-char? ]?
AnyRangeCharacter = EscapeSequence | .
| Number:n "," Number:m "}" => [[ n m <from-to> ]]
Repeated = Element:e "{" Times:t => [[ e t <times> ]]
+ | Element:e "*+" => [[ e <possessive-star> ]]
+ | Element:e "++" => [[ e <possessive-plus> ]]
| Element:e "?" => [[ e <maybe> ]]
| Element:e "*" => [[ e <star> ]]
| Element:e "+" => [[ e <plus> ]]
USING: regexp tools.test kernel sequences regexp.parser
-regexp.traversal eval strings multiline ;
+regexp.traversal eval strings multiline accessors ;
IN: regexp-tests
\ <regexp> must-infer
! Intersecting classes
[ t ] [ "ab" R/ ac|\p{Lower}b/ matches? ] unit-test
[ t ] [ "ab" R/ ac|[a-z]b/ matches? ] unit-test
+[ t ] [ "ac" R/ ac|\p{Lower}b/ matches? ] unit-test
+[ t ] [ "ac" R/ ac|[a-z]b/ matches? ] unit-test
+[ t ] [ "ac" R/ [a-zA-Z]c|\p{Lower}b/ matches? ] unit-test
+[ t ] [ "ab" R/ [a-zA-Z]c|\p{Lower}b/ matches? ] unit-test
+[ t ] [ "πb" R/ [a-zA-Z]c|\p{Lower}b/ matches? ] unit-test
+[ f ] [ "πc" R/ [a-zA-Z]c|\p{Lower}b/ matches? ] unit-test
+[ f ] [ "Ab" R/ [a-zA-Z]c|\p{Lower}b/ matches? ] unit-test
+
+[ f ] [ "foo" <regexp> dfa>> >boolean ] unit-test
+[ t ] [ R/ foo/ dfa>> >boolean ] unit-test
! [ t ] [ "a" R/ ^a/ matches? ] unit-test
! [ f ] [ "\na" R/ ^a/ matches? ] unit-test
-! Copyright (C) 2008 Doug Coleman.
+! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors combinators kernel math sequences strings sets
assocs prettyprint.backend prettyprint.custom make lexer
TUPLE: regexp raw parse-tree options dfa ;
+: make-regexp ( string ast -- regexp )
+ f f <options> f regexp boa ;
+
: <optioned-regexp> ( string options -- regexp )
[ dup parse-regexp ] [ string>options ] bi*
- 2dup <with-options> ast>dfa
- regexp boa ;
+ f regexp boa ;
: <regexp> ( string -- regexp ) "" <optioned-regexp> ;
<PRIVATE
+: get-dfa ( regexp -- dfa )
+ dup dfa>> [ ] [
+ dup
+ [ parse-tree>> ]
+ [ options>> ] bi
+ <with-options> ast>dfa
+ [ >>dfa drop ] keep
+ ] ?if ;
+
: (match) ( string regexp -- dfa-traverser )
- dfa>> <dfa-traverser> do-match ; inline
+ get-dfa <dfa-traverser> do-match ; inline
PRIVATE>
[ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column
lexer get dup still-parsing-line?
[ (parse-token) ] [ drop f ] if
- <optioned-regexp> parsed ;
+ <optioned-regexp> dup get-dfa drop parsed ;
PRIVATE>