TUPLE: at-least n ;
C: <at-least> at-least
-TUPLE: concatenation seq ;
-C: <concatenation> concatenation
+SINGLETON: epsilon
-TUPLE: alternation seq ;
-C: <alternation> alternation
+TUPLE: concatenation first second ;
+
+: <concatenation> ( seq -- concatenation )
+ epsilon [ concatenation boa ] reduce ;
+
+TUPLE: alternation first second ;
+
+: <alternation> ( seq -- alternation )
+ unclip [ alternation boa ] reduce ;
TUPLE: star term ;
C: <star> star
-! Copyright (C) 2008 Doug Coleman.
+! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors kernel math math.order words
ascii unicode.categories combinators.short-circuit sequences ;
GENERIC: class-member? ( obj class -- ? )
+! When does t get put in?
M: t class-member? ( obj class -- ? ) 2drop f ;
-M: integer class-member? ( obj class -- ? ) 2drop f ;
+M: integer class-member? ( obj class -- ? ) = ;
M: range class-member? ( obj class -- ? )
[ from>> ] [ to>> ] bi between? ;
M: end-of-line class-member? ( obj class -- ? )
2drop f ;
+
+TUPLE: or-class seq ;
+C: <or-class> or-class
+
+TUPLE: not-class class ;
+C: <not-class> not-class
+
+M: or-class class-member?
+ seq>> [ class-member? ] with any? ;
+
+M: not-class class-member?
+ class>> class-member? not ;
! Copyright (C) 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: kernel sequences regexp.transition-tables fry assocs
-accessors locals math sorting arrays sets hashtables regexp.dfa ;
+accessors locals math sorting arrays sets hashtables regexp.dfa
+combinators.short-circuit ;
IN: regexp.minimize
+: number-transitions ( transitions numbering -- new-transitions )
+ dup '[
+ [ _ at ]
+ [ [ first _ at ] assoc-map ] bi*
+ ] assoc-map ;
+
+: table>state-numbers ( table -- assoc )
+ transitions>> keys <enum> [ swap ] H{ } assoc-map-as ;
+
+: map-set ( assoc quot -- new-assoc )
+ '[ drop @ dup ] assoc-map ; inline
+
+: rewrite-transitions ( transition-table assoc quot -- transition-table )
+ [
+ [ '[ _ at ] change-start-state ]
+ [ '[ [ _ at ] map-set ] change-final-states ]
+ [ ] tri
+ ] dip '[ _ @ ] change-transitions ; inline
+
+: number-states ( table -- newtable )
+ dup table>state-numbers
+ [ number-transitions ] rewrite-transitions ;
+
+: initially-same? ( s1 s2 transition-table -- ? )
+ {
+ [ drop <= ]
+ [ transitions>> '[ _ at keys ] bi@ set= ]
+ [ final-states>> '[ _ key? ] bi@ = ]
+ } 3&& ;
+
:: initialize-partitions ( transition-table -- partitions )
! Partition table is sorted-array => ?
H{ } clone :> out
transition-table transitions>> keys :> states
states [| s1 |
states [| s2 |
- s1 s2 <= [
- s1 s2 [ transition-table transitions>> at keys ] bi@ set=
- s1 s2 [ transition-table final-states>> key? ] bi@ = and
- [ t s1 s2 2array out set-at ] when
- ] when
+ s1 s2 transition-table initially-same?
+ [ s1 s2 2array out conjoin ] when
] each
] each out ;
'[ _ same-partition? ] assoc-all? ;
: partition-more ( partitions transition-table -- partitions )
- ! This is horribly slow!
over '[ drop first2 _ _ stay-same? ] assoc-filter ;
: partition>classes ( partitions -- synonyms ) ! old-state => new-state
: state-classes ( transition-table -- synonyms )
[ initialize-partitions ] keep
- '[ _ partition-more ] [ ] while-changes
+ '[ _ partition-more ] [ assoc-size ] while-changes
partition>classes ;
: canonical-state? ( state state-classes -- ? )
: rewrite-duplicates ( new-transitions state-classes -- new-transitions )
'[ [ _ at ] assoc-map ] assoc-map ;
-: map-set ( assoc quot -- new-assoc )
- '[ drop @ dup ] assoc-map ; inline
+: combine-transitions ( transitions state-classes -- new-transitions )
+ [ delete-duplicates ] [ rewrite-duplicates ] bi ;
: combine-states ( table -- smaller-table )
dup state-classes
- [
- '[
- _ [ delete-duplicates ]
- [ rewrite-duplicates ] bi
- ] change-transitions
- ]
- [ '[ [ _ at ] map-set ] change-final-states ]
- [ '[ _ at ] change-start-state ]
- tri ;
-
-: number-transitions ( transitions numbering -- new-transitions )
- [
- [ at ]
- [ '[ first _ at ] assoc-map ]
- bi-curry bi*
- ] curry assoc-map ;
-
-: number-states ( table -- newtable )
- dup transitions>> keys <enum> [ swap ] H{ } assoc-map-as
- [ '[ _ at ] change-start-state ]
- [ '[ [ _ at ] map-set ] change-final-states ]
- [ '[ _ number-transitions ] change-transitions ] tri ;
+ [ combine-transitions ] rewrite-transitions ;
: minimize ( table -- minimal-table )
clone number-states combine-states ;
--- /dev/null
+! Copyright (C) 2009 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: tools.test regexp.negation regexp.transition-tables regexp.classes ;
+IN: regexp.negation.tests
+
+[
+ ! R/ |[^a]|.+/
+ T{ transition-table
+ { transitions H{
+ { 0 H{ { CHAR: a 1 } { T{ not-class f T{ or-class f { CHAR: a } } } -1 } } }
+ { 1 H{ { T{ not-class f T{ or-class f { } } } -1 } } }
+ { -1 H{ { any-char -1 } } }
+ } }
+ { start-state 0 }
+ { final-states H{ { 0 0 } { -1 -1 } } }
+ }
+] [
+ ! R/ a/
+ T{ transition-table
+ { transitions H{
+ { 0 H{ { CHAR: a 1 } } }
+ { 1 H{ } }
+ } }
+ { start-state 0 }
+ { final-states H{ { 1 1 } } }
+ } negate-table
+] unit-test
--- /dev/null
+! Copyright (C) 2009 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: regexp.nfa regexp.dfa regexp.minimize kernel sequences
+assocs regexp.classes hashtables accessors ;
+IN: regexp.negation
+
+: ast>dfa ( parse-tree -- minimal-dfa )
+ construct-nfa construct-dfa minimize ;
+
+CONSTANT: fail-state -1
+
+: add-default-transition ( state's-transitions -- new-state's-transitions )
+ clone dup
+ [ [ fail-state ] dip keys <or-class> <not-class> ] keep set-at ;
+
+: fail-state-recurses ( transitions -- new-transitions )
+ clone dup
+ [ fail-state any-char associate fail-state ] dip set-at ;
+
+: add-fail-state ( transitions -- new-transitions )
+ [ add-default-transition ] assoc-map
+ fail-state-recurses ;
+
+: assoc>set ( assoc -- keys-set )
+ [ drop dup ] assoc-map ;
+
+: inverse-final-states ( transition-table -- final-states )
+ [ transitions>> assoc>set ] [ final-states>> ] bi assoc-diff ;
+
+: negate-table ( transition-table -- transition-table )
+ clone
+ [ add-fail-state ] change-transitions
+ dup inverse-final-states >>final-states ;
+
+! M: negation nfa-node ( node -- )
+! ast>dfa negate-table adjoin-dfa ;
USING: accessors arrays assocs grouping kernel
locals math namespaces sequences fry quotations
math.order math.ranges vectors unicode.categories
-regexp.transition-tables words sets
+regexp.transition-tables words sets hashtables
unicode.case.private regexp.ast regexp.classes ;
! This uses unicode.case.private for ch>upper and ch>lower
! but case-insensitive matching should be done by case-folding everything
! before processing starts
IN: regexp.nfa
-ERROR: feature-is-broken feature ;
-
SYMBOL: negated?
: negate ( -- )
SYMBOL: option-stack
-SYMBOL: combine-stack
-
SYMBOL: state
: next-state ( -- state )
state [ get ] [ inc ] bi ;
SYMBOL: nfa-table
+: table ( -- table ) nfa-table get ;
: set-each ( keys value hashtable -- )
'[ _ swap _ set-at ] each ;
: option? ( obj -- ? )
option-stack get assoc-stack ;
-: set-start-state ( -- nfa-table )
- nfa-table get
- combine-stack get pop first >>start-state ;
-
-GENERIC: nfa-node ( node -- )
+GENERIC: nfa-node ( node -- start-state end-state )
-:: add-simple-entry ( obj class -- )
- [let* | s0 [ next-state ]
- s1 [ next-state ]
- stack [ combine-stack get ]
- table [ nfa-table get ] |
- negated? get [
- s0 f obj class make-transition table add-transition
- s0 s1 <default-transition> table add-transition
- ] [
- s0 s1 obj class make-transition table add-transition
- ] if
- s0 s1 2array stack push
- t s1 table final-states>> set-at ] ;
-
-:: concatenate-nodes ( -- )
- [let* | stack [ combine-stack get ]
- table [ nfa-table get ]
- s2 [ stack peek first ]
- s3 [ stack pop second ]
- s0 [ stack peek first ]
- s1 [ stack pop second ] |
- s1 s2 eps <literal-transition> table add-transition
- s1 table final-states>> delete-at
- s0 s3 2array stack push ] ;
-
-:: alternate-nodes ( -- )
- [let* | stack [ combine-stack get ]
- table [ nfa-table get ]
- s2 [ stack peek first ]
- s3 [ stack pop second ]
- s0 [ stack peek first ]
- s1 [ stack pop second ]
- s4 [ next-state ]
- s5 [ next-state ] |
- s4 s0 eps <literal-transition> table add-transition
- s4 s2 eps <literal-transition> table add-transition
- s1 s5 eps <literal-transition> table add-transition
- s3 s5 eps <literal-transition> table add-transition
- s1 table final-states>> delete-at
- s3 table final-states>> delete-at
- t s5 table final-states>> set-at
- s4 s5 2array stack push ] ;
-
-M: star nfa-node ( node -- )
- term>> nfa-node
- [let* | stack [ combine-stack get ]
- s0 [ stack peek first ]
- s1 [ stack pop second ]
- s2 [ next-state ]
- s3 [ next-state ]
- table [ nfa-table get ] |
- s1 table final-states>> delete-at
- t s3 table final-states>> set-at
- s1 s0 eps <literal-transition> table add-transition
- s2 s0 eps <literal-transition> table add-transition
- s2 s3 eps <literal-transition> table add-transition
- s1 s3 eps <literal-transition> table add-transition
- s2 s3 2array stack push ] ;
-
-M: concatenation nfa-node ( node -- )
- seq>> [ eps literal-transition add-simple-entry ] [
- reversed-regexp option? [ <reversed> ] when
- [ [ nfa-node ] each ]
- [ length 1- [ concatenate-nodes ] times ] bi
- ] if-empty ;
-
-M: alternation nfa-node ( node -- )
- seq>>
- [ [ nfa-node ] each ]
- [ length 1- [ alternate-nodes ] times ] bi ;
-
-M: integer nfa-node ( node -- )
+:: add-simple-entry ( obj class -- start-state end-state )
+ next-state :> s0
+ next-state :> s1
+ negated? get [
+ s0 f obj class make-transition table add-transition
+ s0 s1 <default-transition> table add-transition
+ ] [
+ s0 s1 obj class make-transition table add-transition
+ ] if
+ s0 s1 ;
+
+: epsilon-transition ( source target -- )
+ eps <literal-transition> table add-transition ;
+
+M:: star nfa-node ( node -- start end )
+ node term>> nfa-node :> s1 :> s0
+ next-state :> s2
+ next-state :> s3
+ s1 s0 epsilon-transition
+ s2 s0 epsilon-transition
+ s2 s3 epsilon-transition
+ s1 s3 epsilon-transition
+ s2 s3 ;
+
+M: epsilon nfa-node
+ drop eps literal-transition add-simple-entry ;
+
+M: concatenation nfa-node ( node -- start end )
+ [ first>> ] [ second>> ] bi
+ reversed-regexp option? [ swap ] when
+ [ nfa-node ] bi@
+ [ epsilon-transition ] dip ;
+
+:: alternate-nodes ( s0 s1 s2 s3 -- start end )
+ next-state :> s4
+ next-state :> s5
+ s4 s0 epsilon-transition
+ s4 s2 epsilon-transition
+ s1 s5 epsilon-transition
+ s3 s5 epsilon-transition
+ s4 s5 ;
+
+M: alternation nfa-node ( node -- start end )
+ [ first>> ] [ second>> ] bi
+ [ nfa-node ] bi@
+ alternate-nodes ;
+
+M: integer nfa-node ( node -- start end )
case-insensitive option? [
dup [ ch>lower ] [ ch>upper ] bi
2dup = [
literal-transition add-simple-entry
] [
[ literal-transition add-simple-entry ] bi@
- alternate-nodes drop
+ alternate-nodes [ nip ] dip
] if
] [
literal-transition add-simple-entry
] if ;
-M: primitive-class nfa-node ( node -- )
+M: primitive-class nfa-node ( node -- start end )
class>> dup
{ letter-class LETTER-class } member? case-insensitive option? and
[ drop Letter-class ] when
class-transition add-simple-entry ;
-M: any-char nfa-node ( node -- )
+M: any-char nfa-node ( node -- start end )
[ dotall option? ] dip any-char-no-nl ?
class-transition add-simple-entry ;
-M: negation nfa-node ( node -- )
+M: negation nfa-node ( node -- start end )
negate term>> nfa-node negate ;
-M: range nfa-node ( node -- )
+M: range nfa-node ( node -- start end )
case-insensitive option? [
! This should be implemented for Unicode by case-folding
! the input and all strings in the regexp.
class-transition add-simple-entry
] if ;
-M: with-options nfa-node ( node -- )
+M: with-options nfa-node ( node -- start end )
dup options>> [ tree>> nfa-node ] using-options ;
: construct-nfa ( ast -- nfa-table )
[
negated? off
- V{ } clone combine-stack set
0 state set
<transition-table> clone nfa-table set
nfa-node
- set-start-state
+ table
+ swap dup associate >>final-states
+ swap >>start-state
] with-scope ;
combinators regexp.classes strings splitting peg locals accessors
regexp.ast ;
IN: regexp.parser
+
: allowed-char? ( ch -- ? )
".()|[*+?" member? not ;
| "?" Options:on "-"? Options:off ":" Alternation:a
=> [[ a on off parse-options <with-options> ]]
| "?#" [^)]* => [[ f ]]
+ | "?~" Alternation:a => [[ a <negation> ]]
| Alternation
Element = "(" Parenthized:p ")" => [[ p ]]
assocs prettyprint.backend prettyprint.custom make lexer
namespaces parser arrays fry locals regexp.minimize
regexp.parser regexp.nfa regexp.dfa regexp.traversal
-regexp.transition-tables splitting sorting regexp.ast ;
+regexp.transition-tables splitting sorting regexp.ast
+regexp.negation ;
IN: regexp
TUPLE: regexp raw parse-tree options dfa ;
: <optioned-regexp> ( string options -- regexp )
[ dup parse-regexp ] [ string>options ] bi*
- 2dup <with-options> construct-nfa construct-dfa minimize
+ 2dup <with-options> ast>dfa
regexp boa ;
: <regexp> ( string -- regexp ) "" <optioned-regexp> ;