extra/regexp/regexp.factor

   1 USING: arrays combinators kernel lists math math.parser
   2 namespaces parser lexer parser-combinators parser-combinators.simple
   3 promises quotations sequences combinators.lib strings math.order
   4 assocs prettyprint.backend memoize unicode.case unicode.categories
   5 combinators.short-circuit accessors ;
   6 USE: io
   7 IN: regexp
   8
   9 <PRIVATE
  10
  11 SYMBOL: ignore-case?
  12
  13 : char=-quot ( ch -- quot )
  14     ignore-case? get
  15     [ ch>upper [ swap ch>upper = ] ] [ [ = ] ] if
  16     curry ;
  17
  18 : char-between?-quot ( ch1 ch2 -- quot )
  19     ignore-case? get
  20     [ [ ch>upper ] bi@ [ >r >r ch>upper r> r> between? ] ]
  21     [ [ between? ] ]
  22     if 2curry ;
  23
  24 : <@literal ( parser obj -- action ) [ nip ] curry <@ ;
  25
  26 : <@delay ( parser quot -- action ) [ curry ] curry <@ ;
  27
  28 PRIVATE>
  29
  30 : ascii? ( n -- ? )
  31     0 HEX: 7f between? ;
  32
  33 : octal-digit? ( n -- ? )
  34     CHAR: 0 CHAR: 7 between? ;
  35
  36 : decimal-digit? ( n -- ? )
  37     CHAR: 0 CHAR: 9 between? ;
  38
  39 : hex-digit? ( n -- ? )
  40     dup decimal-digit?
  41     over CHAR: a CHAR: f between? or
  42     swap CHAR: A CHAR: F between? or ;
  43
  44 : control-char? ( n -- ? )
  45     dup 0 HEX: 1f between?
  46     swap HEX: 7f = or ;
  47
  48 : punct? ( n -- ? )
  49     "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
  50
  51 : c-identifier-char? ( ch -- ? )
  52     dup alpha? swap CHAR: _ = or ;
  53
  54 : java-blank? ( n -- ? )
  55     {
  56         CHAR: \s
  57         CHAR: \t CHAR: \n CHAR: \r
  58         HEX: c HEX: 7 HEX: 1b
  59     } member? ;
  60
  61 : java-printable? ( n -- ? )
  62     dup alpha? swap punct? or ;
  63
  64 : 'ordinary-char' ( -- parser )
  65     [ "\\^*+?|(){}[$" member? not ] satisfy
  66     [ char=-quot ] <@ ;
  67
  68 : 'octal-digit' ( -- parser ) [ octal-digit? ] satisfy ;
  69
  70 : 'octal' ( -- parser )
  71     "0" token 'octal-digit' 1 3 from-m-to-n &>
  72     [ oct> ] <@ ;
  73
  74 : 'hex-digit' ( -- parser ) [ hex-digit? ] satisfy ;
  75
  76 : 'hex' ( -- parser )
  77     "x" token 'hex-digit' 2 exactly-n &>
  78     "u" token 'hex-digit' 6 exactly-n &> <|>
  79     [ hex> ] <@ ;
  80
  81 : satisfy-tokens ( assoc -- parser )
  82     [ >r token r> <@literal ] { } assoc>map <or-parser> ;
  83
  84 : 'simple-escape-char' ( -- parser )
  85     {
  86         { "\\" CHAR: \\ }
  87         { "t"  CHAR: \t }
  88         { "n"  CHAR: \n }
  89         { "r"  CHAR: \r }
  90         { "f"  HEX: c   }
  91         { "a"  HEX: 7   }
  92         { "e"  HEX: 1b  }
  93     } [ char=-quot ] assoc-map satisfy-tokens ;
  94
  95 : 'predefined-char-class' ( -- parser )
  96     {
  97         { "d" [ digit? ] }
  98         { "D" [ digit? not ] }
  99         { "s" [ java-blank? ] }
 100         { "S" [ java-blank? not ] }
 101         { "w" [ c-identifier-char? ] }
 102         { "W" [ c-identifier-char? not ] }
 103     } satisfy-tokens ;
 104
 105 : 'posix-character-class' ( -- parser )
 106     {
 107         { "Lower" [ letter? ] }
 108         { "Upper" [ LETTER? ] }
 109         { "ASCII" [ ascii? ] }
 110         { "Alpha" [ Letter? ] }
 111         { "Digit" [ digit? ] }
 112         { "Alnum" [ alpha? ] }
 113         { "Punct" [ punct? ] }
 114         { "Graph" [ java-printable? ] }
 115         { "Print" [ java-printable? ] }
 116         { "Blank" [ " \t" member? ] }
 117         { "Cntrl" [ control-char? ] }
 118         { "XDigit" [ hex-digit? ] }
 119         { "Space" [ java-blank? ] }
 120     } satisfy-tokens "p{" "}" surrounded-by ;
 121
 122 : 'simple-escape' ( -- parser )
 123     'octal'
 124     'hex' <|>
 125     "c" token [ LETTER? ] satisfy &> <|>
 126     any-char-parser <|>
 127     [ char=-quot ] <@ ;
 128
 129 : 'escape' ( -- parser )
 130     "\\" token
 131     'simple-escape-char'
 132     'predefined-char-class' <|>
 133     'posix-character-class' <|>
 134     'simple-escape' <|> &> ;
 135
 136 : 'any-char' ( -- parser )
 137     "." token [ drop t ] <@literal ;
 138
 139 : 'char' ( -- parser )
 140     'any-char' 'escape' 'ordinary-char' <|> <|> [ satisfy ] <@ ;
 141
 142 DEFER: 'regexp'
 143
 144 TUPLE: group-result str ;
 145
 146 C: <group-result> group-result
 147
 148 : 'non-capturing-group' ( -- parser )
 149     "?:" token 'regexp' &> ;
 150
 151 : 'positive-lookahead-group' ( -- parser )
 152     "?=" token 'regexp' &> [ ensure ] <@ ;
 153
 154 : 'negative-lookahead-group' ( -- parser )
 155     "?!" token 'regexp' &> [ ensure-not ] <@ ;
 156
 157 : 'simple-group' ( -- parser )
 158     'regexp' [ [ <group-result> ] <@ ] <@ ;
 159
 160 : 'group' ( -- parser )
 161     'non-capturing-group'
 162     'positive-lookahead-group'
 163     'negative-lookahead-group'
 164     'simple-group' <|> <|> <|>
 165     "(" ")" surrounded-by ;
 166
 167 : 'range' ( -- parser )
 168     [ CHAR: ] = not ] satisfy "-" token <&
 169     [ CHAR: ] = not ] satisfy <&>
 170     [ first2 char-between?-quot ] <@ ;
 171
 172 : 'character-class-term' ( -- parser )
 173     'range'
 174     'escape' <|>
 175     [ "\\]" member? not ] satisfy [ char=-quot ] <@ <|> ;
 176
 177 : 'positive-character-class' ( -- parser )
 178     "]" token [ CHAR: ] = ] <@literal 'character-class-term' <*> <&:>
 179     'character-class-term' <+> <|>
 180     [ [ 1|| ] curry ] <@ ;
 181
 182 : 'negative-character-class' ( -- parser )
 183     "^" token 'positive-character-class' &>
 184     [ [ not ] append ] <@ ;
 185
 186 : 'character-class' ( -- parser )
 187     'negative-character-class' 'positive-character-class' <|>
 188     "[" "]" surrounded-by [ satisfy ] <@ ;
 189
 190 : 'escaped-seq' ( -- parser )
 191     any-char-parser <*>
 192     [ ignore-case? get <token-parser> ] <@
 193     "\\Q" "\\E" surrounded-by ;
 194
 195 : 'break' ( quot -- parser )
 196     satisfy ensure epsilon just <|> ;
 197
 198 : 'break-escape' ( -- parser )
 199     "$" token [ "\r\n" member? ] 'break' <@literal
 200     "\\b" token [ blank? ] 'break' <@literal <|>
 201     "\\B" token [ blank? not ] 'break' <@literal <|>
 202     "\\z" token epsilon just <@literal <|> ;
 203
 204 : 'simple' ( -- parser )
 205     'escaped-seq'
 206     'break-escape' <|>
 207     'group' <|>
 208     'character-class' <|>
 209     'char' <|> ;
 210
 211 : 'exactly-n' ( -- parser )
 212     'integer' [ exactly-n ] <@delay ;
 213
 214 : 'at-least-n' ( -- parser )
 215     'integer' "," token <& [ at-least-n ] <@delay ;
 216
 217 : 'at-most-n' ( -- parser )
 218     "," token 'integer' &> [ at-most-n ] <@delay ;
 219
 220 : 'from-m-to-n' ( -- parser )
 221     'integer' "," token <& 'integer' <&> [ first2 from-m-to-n ] <@delay ;
 222
 223 : 'greedy-interval' ( -- parser )
 224     'exactly-n' 'at-least-n' <|> 'at-most-n' <|> 'from-m-to-n' <|> ;
 225
 226 : 'interval' ( -- parser )
 227     'greedy-interval'
 228     'greedy-interval' "?" token <& [ "reluctant {}" print ] <@ <|>
 229     'greedy-interval' "+" token <& [ "possessive {}" print ] <@ <|>
 230     "{" "}" surrounded-by ;
 231
 232 : 'repetition' ( -- parser )
 233     ! Posessive
 234     "*+" token [ <!*> ] <@literal
 235     "++" token [ <!+> ] <@literal <|>
 236     "?+" token [ <!?> ] <@literal <|>
 237     ! Reluctant
 238     "*?" token [ <(*)> ] <@literal <|>
 239     "+?" token [ <(+)> ] <@literal <|>
 240     "??" token [ <(?)> ] <@literal <|>
 241     ! Greedy
 242     "*" token [ <*> ] <@literal <|>
 243     "+" token [ <+> ] <@literal <|>
 244     "?" token [ <?> ] <@literal <|> ;
 245
 246 : 'dummy' ( -- parser )
 247     epsilon [ ] <@literal ;
 248
 249 MEMO: 'term' ( -- parser )
 250     'simple'
 251     'repetition' 'interval' 'dummy' <|> <|> <&> [ first2 call ] <@
 252     <!+> [ <and-parser> ] <@ ;
 253
 254 LAZY: 'regexp' ( -- parser )
 255     'term' "|" token nonempty-list-of [ <or-parser> ] <@ ;
 256 !    "^" token 'term' "|" token nonempty-list-of [ <or-parser> ] <@
 257 !        &> [ "caret" print ] <@ <|>
 258 !    'term' "|" token nonempty-list-of [ <or-parser> ] <@
 259 !        "$" token <& [ "dollar" print ] <@ <|>
 260 !    "^" token 'term' "|" token nonempty-list-of [ <or-parser> ] <@ &>
 261 !        "$" token [ "caret dollar" print ] <@ <& <|> ;
 262
 263 TUPLE: regexp source parser ignore-case? ;
 264
 265 : <regexp> ( string ignore-case? -- regexp )
 266     [
 267         ignore-case? [
 268             dup 'regexp' just parse-1
 269         ] with-variable
 270     ] keep regexp boa ;
 271
 272 : do-ignore-case ( string regexp -- string regexp )
 273     dup ignore-case?>> [ >r >upper r> ] when ;
 274
 275 : matches? ( string regexp -- ? )
 276     do-ignore-case parser>> just parse nil? not ;
 277
 278 : match-head ( string regexp -- end )
 279     do-ignore-case parser>> parse dup nil?
 280     [ drop f ] [ car unparsed>> from>> ] if ;
 281
 282 ! Literal syntax for regexps
 283 : parse-options ( string -- ? )
 284     #! Lame
 285     {
 286         { "" [ f ] }
 287         { "i" [ t ] }
 288     } case ;
 289
 290 : parse-regexp ( accum end -- accum )
 291     lexer get dup skip-blank
 292     [ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column
 293     lexer get dup still-parsing-line?
 294     [ (parse-token) parse-options ] [ drop f ] if
 295     <regexp> parsed ;
 296
 297 : R! CHAR: ! parse-regexp ; parsing
 298 : R" CHAR: " parse-regexp ; parsing
 299 : R# CHAR: # parse-regexp ; parsing
 300 : R' CHAR: ' parse-regexp ; parsing
 301 : R( CHAR: ) parse-regexp ; parsing
 302 : R/ CHAR: / parse-regexp ; parsing
 303 : R@ CHAR: @ parse-regexp ; parsing
 304 : R[ CHAR: ] parse-regexp ; parsing
 305 : R` CHAR: ` parse-regexp ; parsing
 306 : R{ CHAR: } parse-regexp ; parsing
 307 : R| CHAR: | parse-regexp ; parsing
 308
 309 : find-regexp-syntax ( string -- prefix suffix )
 310     {
 311         { "R/ "  "/"  }
 312         { "R! "  "!"  }
 313         { "R\" " "\"" }
 314         { "R# "  "#"  }
 315         { "R' "  "'"  }
 316         { "R( "  ")"  }
 317         { "R@ "  "@"  }
 318         { "R[ "  "]"  }
 319         { "R` "  "`"  }
 320         { "R{ "  "}"  }
 321         { "R| "  "|"  }
 322     } swap [ subseq? not nip ] curry assoc-find drop ;
 323
 324 M: regexp pprint*
 325     [
 326         dup source>>
 327         dup find-regexp-syntax swap % swap % %
 328         dup ignore-case?>> [ "i" % ] when
 329     ] "" make
 330     swap present-text ;