: LETTER? ( ch -- ? ) CHAR: A CHAR: Z between? ; inline\r
: digit? ( ch -- ? ) CHAR: 0 CHAR: 9 between? ; inline\r
: printable? ( ch -- ? ) CHAR: \s CHAR: ~ between? ; inline\r
-: control? ( ch -- ? ) "\0\e\r\n\t\u000008\u00007f" member? ; inline\r
+: control? ( ch -- ? ) { [ 0 HEX: 1F between? ] [ HEX: 7F = ] } 1|| ; inline\r
: quotable? ( ch -- ? ) { [ printable? ] [ "\"\\" member? not ] } 1&& ; inline\r
: Letter? ( ch -- ? ) { [ letter? ] [ LETTER? ] } 1|| ; inline\r
: alpha? ( ch -- ? ) { [ Letter? ] [ digit? ] } 1|| ; inline\r
: >upper ( str -- upper ) [ ch>upper ] map ;\r
\r
HINTS: >lower string ;\r
-HINTS: >upper string ;
\ No newline at end of file
+HINTS: >upper string ;\r
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors kernel math math.order words regexp.utils
-unicode.categories combinators.short-circuit ;
+ascii unicode.categories combinators.short-circuit ;
IN: regexp.classes
SINGLETONS: any-char any-char-no-nl
drop { [ blank? ] [ CHAR: \n = not ] } 1&& ;
M: control-character-class class-member? ( obj class -- ? )
- drop control-char? ;
+ drop control? ;
M: hex-digit-class class-member? ( obj class -- ? )
drop hex-digit? ;
] if ;
M: capture-group nfa-node ( node -- )
- "capture-groups" feature-is-broken
- eps literal-transition add-simple-entry
- capture-group-on add-traversal-flag
- term>> nfa-node
- eps literal-transition add-simple-entry
- capture-group-off add-traversal-flag
- 2 [ concatenate-nodes ] times ;
+ term>> nfa-node ;
-! xyzzy
M: non-capture-group nfa-node ( node -- )
term>> nfa-node ;
! Dotall mode -- when on, . matches newlines.
! Off by default.
[ f ] [ "\n" "." <regexp> matches? ] unit-test
-[ t ] [ "\n" "(?s)." <regexp> matches? ] unit-test
+! [ t ] [ "\n" "(?s)." <regexp> matches? ] unit-test
[ t ] [ "\n" R/ ./s matches? ] unit-test
-[ f ] [ "\n\n" "(?s).(?-s)." <regexp> matches? ] unit-test
+! [ f ] [ "\n\n" "(?s).(?-s)." <regexp> matches? ] unit-test
[ f ] [ "" ".+" <regexp> matches? ] unit-test
[ t ] [ "a" ".+" <regexp> matches? ] unit-test
[ t ] [ "bar" "foo|bar" <regexp> matches? ] unit-test
[ f ] [ "foobar" "foo|bar" <regexp> matches? ] unit-test
-/*
-! FIXME
[ f ] [ "" "(a)" <regexp> matches? ] unit-test
[ t ] [ "a" "(a)" <regexp> matches? ] unit-test
[ f ] [ "aa" "(a)" <regexp> matches? ] unit-test
[ f ] [ "aababaaabbac" "(a|b)+" <regexp> matches? ] unit-test
[ t ] [ "ababaaabba" "(a|b)+" <regexp> matches? ] unit-test
-*/
[ f ] [ "" "a{1}" <regexp> matches? ] unit-test
[ t ] [ "a" "a{1}" <regexp> matches? ] unit-test
[ f ] [ "0" "[^\\d]" <regexp> matches? ] unit-test
[ t ] [ "a" "[^\\d]" <regexp> matches? ] unit-test
-/*
-! FIXME
[ t ] [ "a" "[a-z]{1,}|[A-Z]{2,4}|b*|c|(f|g)*" <regexp> matches? ] unit-test
[ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}|b*|c|(f|g)*" <regexp> matches? ] unit-test
[ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}" <regexp> matches? ] unit-test
-*/
[ t ] [ "1000" "\\d{4,6}" <regexp> matches? ] unit-test
[ t ] [ "1000" "[0-9]{4,6}" <regexp> matches? ] unit-test
[ t ] [ "c" R/ [A-Z]/i matches? ] unit-test
[ f ] [ "3" R/ [A-Z]/i matches? ] unit-test
+/*
[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
[ t ] [ "a" R/ (?-i)a/i matches? ] unit-test
[ f ] [ "A" R/ (?-i)a/i matches? ] unit-test
[ f ] [ "A" R/ (?-i)a/i matches? ] unit-test
+*/
[ f ] [ "A" "[a-z]" <regexp> matches? ] unit-test
[ t ] [ "A" R/ [a-z]/i matches? ] unit-test
[ t ] [ "abc*" "[^\\*]*\\*" <regexp> matches? ] unit-test
[ t ] [ "bca" "[^a]*a" <regexp> matches? ] unit-test
-/*
-! FIXME
[ ] [
"(0[lL]?|[1-9]\\d{0,9}(\\d{0,9}[lL])?|0[xX]\\p{XDigit}{1,8}(\\p{XDigit}{0,8}[lL])?|0[0-7]{1,11}([0-7]{0,11}[lL])?|([0-9]+\\.[0-9]*|\\.[0-9]+)([eE][+-]?[0-9]+)?[fFdD]?|[0-9]+([eE][+-]?[0-9]+[fFdD]?|([eE][+-]?[0-9]+)?[fFdD]))"
<regexp> drop
[ "abc" ] [ "abc" "(ab|a)(bc)?" <regexp> first-match >string ] unit-test
[ "b" ] [ "aaaaaaaaaaaaaaaaaaaaaaab" "((a*)*b)*b" <regexp> first-match >string ] unit-test
-*/
! [ t ] [ "a:b" ".+:?" <regexp> matches? ] unit-test
[ "-- title --" ] [ "== title ==" R/ =/ "-" re-replace ] unit-test
/*
-! FIXME
[ f ] [ "ab" "a(?!b)" <regexp> first-match ] unit-test
[ "a" ] [ "ac" "a(?!b)" <regexp> first-match >string ] unit-test
[ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
#! set the state as a key
2dup [ to>> ] dip maybe-initialize-key
[ [ to>> ] [ obj>> ] [ from>> ] tri ] dip
- 2dup at* [ 2nip insert-at ]
- [ drop [ H{ } clone [ insert-at ] keep ] 2dip set-at ] if ;
+ 2dup at* [ 2nip push-at ]
+ [ drop [ H{ } clone [ push-at ] keep ] 2dip set-at ] if ;
: add-transition ( transition transition-table -- )
transitions>> set-transition ;
TUPLE: dfa-traverser
dfa-table
- traversal-flags
- traverse-forward
- lookahead-counters
- lookbehind-counters
- capture-counters
- captured-groups
- capture-group-index
- last-state current-state
+ current-state
text
match-failed?
start-index current-index
matches ;
: <dfa-traverser> ( text regexp -- match )
- [ dfa-table>> ] [ dfa-traversal-flags>> ] bi
+ dfa-table>>
dfa-traverser new
- swap >>traversal-flags
swap [ start-state>> >>current-state ] [ >>dfa-table ] bi
swap >>text
- t >>traverse-forward
0 >>start-index
0 >>current-index
- 0 >>capture-group-index
- V{ } clone >>matches
- V{ } clone >>capture-counters
- V{ } clone >>lookbehind-counters
- V{ } clone >>lookahead-counters
- H{ } clone >>captured-groups ;
+ V{ } clone >>matches ;
: final-state? ( dfa-traverser -- ? )
[ current-state>> ]
dup save-final-state
] when text-finished? ;
+: text-character ( dfa-traverser n -- ch )
+ [ text>> ] swap '[ current-index>> _ + ] bi nth ;
+
: previous-text-character ( dfa-traverser -- ch )
- [ text>> ] [ current-index>> 1- ] bi nth ;
+ -1 text-character ;
: current-text-character ( dfa-traverser -- ch )
- [ text>> ] [ current-index>> ] bi nth ;
+ 0 text-character ;
: next-text-character ( dfa-traverser -- ch )
- [ text>> ] [ current-index>> 1+ ] bi nth ;
-
-GENERIC: flag-action ( dfa-traverser flag -- )
-
-
-M: beginning-of-input flag-action ( dfa-traverser flag -- )
- drop
- dup beginning-of-text? [ t >>match-failed? ] unless drop ;
-
-M: end-of-input flag-action ( dfa-traverser flag -- )
- drop
- dup end-of-text? [ t >>match-failed? ] unless drop ;
-
-
-M: beginning-of-line flag-action ( dfa-traverser flag -- )
- drop
- dup {
- [ beginning-of-text? ]
- [ previous-text-character terminator-class class-member? ]
- } 1|| [ t >>match-failed? ] unless drop ;
-
-M: end-of-line flag-action ( dfa-traverser flag -- )
- drop
- dup {
- [ end-of-text? ]
- [ next-text-character terminator-class class-member? ]
- } 1|| [ t >>match-failed? ] unless drop ;
-
-
-M: word-boundary flag-action ( dfa-traverser flag -- )
- drop
- dup {
- [ end-of-text? ]
- [ current-text-character terminator-class class-member? ]
- } 1|| [ t >>match-failed? ] unless drop ;
-
-
-M: lookahead-on flag-action ( dfa-traverser flag -- )
- drop
- lookahead-counters>> 0 swap push ;
-
-M: lookahead-off flag-action ( dfa-traverser flag -- )
- drop
- dup lookahead-counters>>
- [ drop ] [ pop '[ _ - ] change-current-index drop ] if-empty ;
-
-M: lookbehind-on flag-action ( dfa-traverser flag -- )
- drop
- f >>traverse-forward
- [ 2 - ] change-current-index
- lookbehind-counters>> 0 swap push ;
-
-M: lookbehind-off flag-action ( dfa-traverser flag -- )
- drop
- t >>traverse-forward
- dup lookbehind-counters>>
- [ drop ] [ pop '[ _ + 2 + ] change-current-index drop ] if-empty ;
-
-M: capture-group-on flag-action ( dfa-traverser flag -- )
- drop
- [ current-index>> 0 2array ]
- [ capture-counters>> ] bi push ;
-
-M: capture-group-off flag-action ( dfa-traverser flag -- )
- drop
- dup capture-counters>> empty? [
- drop
- ] [
- {
- [ capture-counters>> pop first2 dupd + ]
- [ text>> <slice> ]
- [ [ 1+ ] change-capture-group-index capture-group-index>> ]
- [ captured-groups>> set-at ]
- } cleave
- ] if ;
-
-: process-flags ( dfa-traverser -- )
- [ [ 1+ ] map ] change-lookahead-counters
- [ [ 1+ ] map ] change-lookbehind-counters
- [ [ first2 1+ 2array ] map ] change-capture-counters
- ! dup current-state>> .
- dup [ current-state>> ] [ traversal-flags>> ] bi
- at [ flag-action ] with each ;
+ 1 text-character ;
: increment-state ( dfa-traverser state -- dfa-traverser )
- [
- dup traverse-forward>>
- [ [ 1+ ] change-current-index ]
- [ [ 1- ] change-current-index ] if
- dup current-state>> >>last-state
- ] [ first ] bi* >>current-state ;
+ [ [ 1 + ] change-current-index ]
+ [ first ] bi* >>current-state ;
: match-literal ( transition from-state table -- to-state/f )
transitions>> at at ;
: match-class ( transition from-state table -- to-state/f )
transitions>> at* [
- [ drop class-member? ] assoc-with assoc-find [ nip ] [ drop ] if
+ '[ drop _ swap class-member? ] assoc-find [ nip ] [ drop ] if
] [ drop ] if ;
: match-default ( transition from-state table -- to-state/f )
[ dfa-table>> ] tri ;
: do-match ( dfa-traverser -- dfa-traverser )
- dup process-flags
dup match-done? [
dup setup-match match-transition
[ increment-state do-match ] when*
: while-changes ( obj quot pred -- obj' )
pick over call (while-changes) ; inline
-: assoc-with ( param assoc quot -- assoc curry )
- swapd [ [ -rot ] dip call ] 2curry ; inline
-
-: insert-at ( value key hash -- )
- 2dup at* [
- 2nip push
- ] [
- drop
- [ dup vector? [ 1vector ] unless ] 2dip set-at
- ] if ;
-
-: ?insert-at ( value key hash/f -- hash )
- [ H{ } clone ] unless* [ insert-at ] keep ;
-
ERROR: bad-octal number ;
ERROR: bad-hex number ;
: check-octal ( octal -- octal ) dup 255 > [ bad-octal ] when ;
: check-hex ( hex -- hex ) dup number? [ bad-hex ] unless ;
-: ascii? ( n -- ? ) 0 HEX: 7f between? ;
-: octal-digit? ( n -- ? ) CHAR: 0 CHAR: 7 between? ;
: decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ;
: hex-digit? ( n -- ? )
- [
+ {
[ decimal-digit? ]
[ CHAR: a CHAR: f between? ]
[ CHAR: A CHAR: F between? ]
- ] 1|| ;
-
-: control-char? ( n -- ? )
- [
- [ 0 HEX: 1f between? ]
- [ HEX: 7f = ]
- ] 1|| ;
+ } 1|| ;
: punct? ( n -- ? )
"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
: c-identifier-char? ( ch -- ? )
- [ [ alpha? ] [ CHAR: _ = ] ] 1|| ;
+ { [ alpha? ] [ CHAR: _ = ] } 1|| ;
: java-blank? ( n -- ? )
{
USING: xmode.loader xmode.utilities xmode.rules namespaces
strings splitting assocs sequences kernel io.files xml memoize
-words globs combinators io.encodings.utf8 sorting accessors xml.data ;
+words globs combinators io.encodings.utf8 sorting accessors xml.data
+xml.traversal xml.syntax ;
IN: xmode.catalog
TUPLE: mode file file-name-glob first-line-glob ;
-<TAGS: parse-mode-tag ( modes tag -- )
+TAGS: parse-mode-tag ( modes tag -- )
-TAG: MODE
+TAG: MODE parse-mode-tag
dup "NAME" attr [
mode new {
{ "FILE" f (>>file) }
] dip
rot set-at ;
-TAGS>
-
: parse-modes-tag ( tag -- modes )
H{ } clone [
- swap child-tags [ parse-mode-tag ] with each
+ swap children-tags [ parse-mode-tag ] with each
] keep ;
MEMO: modes ( -- modes )
] if ;
: finalize-mode ( rulesets -- )
- rule-sets [
- dup [ nip finalize-rule-set ] assoc-each
+ dup rule-sets [
+ [ nip finalize-rule-set ] assoc-each
] with-variable ;
: load-mode ( name -- rule-sets )
USING: xmode.loader.syntax xmode.tokens xmode.rules
xmode.keyword-map xml.data xml.traversal xml assocs kernel
combinators sequences math.parser namespaces parser
-xmode.utilities parser-combinators.regexp io.files accessors ;
+xmode.utilities regexp io.files accessors xml.syntax ;
IN: xmode.loader
! Based on org.gjt.sp.jedit.XModeHandler
! RULES and its children
-<TAGS: parse-rule-tag ( rule-set tag -- )
+TAGS: parse-rule-tag ( rule-set tag -- )
-TAG: PROPS
+TAG: PROPS parse-rule-tag
parse-props-tag >>props drop ;
-TAG: IMPORT
+TAG: IMPORT parse-rule-tag
"DELEGATE" attr swap import-rule-set ;
-TAG: TERMINATE
+TAG: TERMINATE parse-rule-tag
"AT_CHAR" attr string>number >>terminate-char drop ;
-RULE: SEQ seq-rule
+RULE: SEQ seq-rule parse-rule-tag
shared-tag-attrs delegate-attr literal-start ;
-RULE: SEQ_REGEXP seq-rule
+RULE: SEQ_REGEXP seq-rule parse-rule-tag
shared-tag-attrs delegate-attr regexp-attr regexp-start ;
-RULE: SPAN span-rule
+RULE: SPAN span-rule parse-rule-tag
shared-tag-attrs delegate-attr match-type-attr span-attrs parse-begin/end-tags init-span-tag ;
-RULE: SPAN_REGEXP span-rule
+RULE: SPAN_REGEXP span-rule parse-rule-tag
shared-tag-attrs delegate-attr match-type-attr span-attrs regexp-attr parse-begin/end-tags init-span-tag ;
-RULE: EOL_SPAN eol-span-rule
+RULE: EOL_SPAN eol-span-rule parse-rule-tag
shared-tag-attrs delegate-attr match-type-attr literal-start init-eol-span-tag ;
-RULE: EOL_SPAN_REGEXP eol-span-rule
+RULE: EOL_SPAN_REGEXP eol-span-rule parse-rule-tag
shared-tag-attrs delegate-attr match-type-attr regexp-attr regexp-start init-eol-span-tag ;
-RULE: MARK_FOLLOWING mark-following-rule
+RULE: MARK_FOLLOWING mark-following-rule parse-rule-tag
shared-tag-attrs match-type-attr literal-start ;
-RULE: MARK_PREVIOUS mark-previous-rule
+RULE: MARK_PREVIOUS mark-previous-rule parse-rule-tag
shared-tag-attrs match-type-attr literal-start ;
-TAG: KEYWORDS ( rule-set tag -- key value )
+TAG: KEYWORDS parse-rule-tag
rule-set get ignore-case?>> <keyword-map>
- swap child-tags [ over parse-keyword-tag ] each
+ swap children-tags [ over parse-keyword-tag ] each
swap (>>keywords) ;
-TAGS>
-
: ?<regexp> ( string/f -- regexp/f )
- dup [ rule-set get ignore-case?>> <regexp> ] when ;
+ dup [ rule-set get ignore-case?>> <?insensitive-regexp> ] when ;
: (parse-rules-tag) ( tag -- rule-set )
<rule-set> dup rule-set set
: parse-rules-tag ( tag -- rule-set )
[
- [ (parse-rules-tag) ] [ child-tags ] bi
+ [ (parse-rules-tag) ] [ children-tags ] bi
[ parse-rule-tag ] with each
rule-set get
] with-scope ;
USING: accessors xmode.tokens xmode.rules xmode.keyword-map
xml.data xml.traversal xml assocs kernel combinators sequences
math.parser namespaces make parser lexer xmode.utilities
-parser-combinators.regexp io.files splitting arrays ;
+regexp io.files splitting arrays xml.syntax xml.syntax.private ;
IN: xmode.loader.syntax
! Rule tag parsing utilities
new swap init-from-tag swap add-rule ; inline
: RULE:
- scan scan-word
- parse-definition { } make
- swap [ (parse-rule-tag) ] 2curry (TAG:) ; parsing
+ scan scan-word scan-word [
+ parse-definition { } make
+ swap [ (parse-rule-tag) ] 2curry
+ ] dip swap define-tag ; parsing
! Attribute utilities
: string>boolean ( string -- ? ) "TRUE" = ;
[ "NAME" attr ] [ "VALUE" attr ] bi ;
: parse-props-tag ( tag -- assoc )
- child-tags
+ children-tags
[ parse-prop-tag ] H{ } map>assoc ;
: position-attrs ( tag -- at-line-start? at-whitespace-end? at-word-start? )
swap position-attrs <matcher> ;
: parse-regexp-matcher ( tag -- matcher )
- dup children>string rule-set get ignore-case?>> <regexp>
+ dup children>string
+ rule-set get ignore-case?>> <?insensitive-regexp>
swap position-attrs <matcher> ;
: shared-tag-attrs ( -- )
[ parse-literal-matcher >>end drop ] , ;
! SPAN's children
-<TAGS: parse-begin/end-tag ( rule tag -- )
+TAGS: parse-begin/end-tag ( rule tag -- )
-TAG: BEGIN
+TAG: BEGIN parse-begin/end-tag
! XXX
parse-literal-matcher >>start drop ;
-TAG: END
+TAG: END parse-begin/end-tag
! XXX
parse-literal-matcher >>end drop ;
-TAGS>
-
: parse-begin/end-tags ( -- )
[
! XXX: handle position attrs on span tag itself
- child-tags [ parse-begin/end-tag ] with each
+ children-tags [ parse-begin/end-tag ] with each
] , ;
: init-span-tag ( -- ) [ drop init-span ] , ;
USING: kernel namespaces make xmode.rules xmode.tokens
xmode.marker.state xmode.marker.context xmode.utilities
xmode.catalog sequences math assocs combinators strings
-parser-combinators.regexp splitting parser-combinators ascii
+regexp splitting ascii regexp.backend unicode.case
ascii combinators.short-circuit accessors ;
+! regexp.backend is for the regexp class
+
+! Next two words copied from parser-combinators
+! Just like head?, but they optionally ignore case
+
+: string= ( str1 str2 ignore-case -- ? )
+ [ [ >upper ] bi@ ] when sequence= ;
+
+: string-head? ( str1 str2 ignore-case -- ? )
+ 2over shorter?
+ [ 3drop f ] [
+ [
+ [ nip ]
+ [ length head-slice ] 2bi
+ ] dip string=
+ ] if ;
! Based on org.gjt.sp.jedit.syntax.TokenMarker
process-escape? get [
escaped? [ not ] change
position [ + ] change
- ] [ 2drop ] if ;
+ ] [ drop ] if ;
M: seq-rule handle-rule-start
?end-rule
USING: accessors xmode.tokens xmode.keyword-map kernel
sequences vectors assocs strings memoize unicode.case
-parser-combinators.regexp ;
+regexp regexp.backend ; ! regexp.backend has the regexp class
IN: xmode.rules
TUPLE: string-matcher string ignore-case? ;
+USING: assocs xmode.utilities tools.test ;
IN: xmode.utilities.tests
-USING: accessors xmode.utilities tools.test xml xml.data kernel
-strings vectors sequences io.files prettyprint assocs
-unicode.case ;
+
[ "hi" 3 ] [
{ 1 2 3 4 5 6 7 8 } [ H{ { 3 "hi" } } at ] map-find
] unit-test
[ f f ] [
{ 1 2 3 4 5 6 7 8 } [ H{ { 11 "hi" } } at ] map-find
] unit-test
-
-TUPLE: company employees type ;
-
-: <company> V{ } clone f company boa ;
-
-: add-employee employees>> push ;
-
-<TAGS: parse-employee-tag
-
-TUPLE: employee name description ;
-
-TAG: employee
- employee new
- { { "name" f (>>name) } { f (>>description) } }
- init-from-tag swap add-employee ;
-
-TAGS>
-
-\ parse-employee-tag see
-
-: parse-company-tag
- [
- <company>
- { { "type" >upper (>>type) } }
- init-from-tag dup
- ] keep
- children>> [ tag? ] filter
- [ parse-employee-tag ] with each ;
-
-[
- T{ company f
- V{
- T{ employee f "Joe" "VP Sales" }
- T{ employee f "Jane" "CFO" }
- }
- "PUBLIC"
- }
-] [
- "resource:basis/xmode/utilities/test.xml"
- file>xml parse-company-tag
-] unit-test
USING: accessors sequences assocs kernel quotations namespaces
-xml.data xml.traversal combinators macros parser lexer words fry ;
+xml.data xml.traversal combinators macros parser lexer words fry
+regexp ;
IN: xmode.utilities
: implies ( x y -- z ) [ not ] dip or ; inline
-: child-tags ( tag -- seq ) children>> [ tag? ] filter ;
-
: map-find ( seq quot -- result elt )
[ f ] 2dip
'[ nip @ dup ] find
: init-from-tag ( tag tuple specs -- tuple )
over [ (init-from-tag) ] dip ; inline
-SYMBOL: tag-handlers
-SYMBOL: tag-handler-word
-
-: <TAGS:
- CREATE tag-handler-word set
- H{ } clone tag-handlers set ; parsing
-
-: (TAG:) ( name quot -- ) swap tag-handlers get set-at ;
-
-: TAG:
- scan parse-definition
- (TAG:) ; parsing
-
-: TAGS>
- tag-handler-word get
- tag-handlers get >alist [ [ dup main>> ] dip case ] curry
- define ; parsing
+: <?insensitive-regexp> ( string ? -- regexp )
+ "i" "" ? <optioned-regexp> ;