basis/regexp/regexp-docs.factor

   1 ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
   2 ! See http://factorcode.org/license.txt for BSD license.
   3 USING: kernel strings help.markup help.syntax math regexp.parser
   4 regexp.ast ;
   5 IN: regexp
   6
   7 ABOUT: "regexp"
   8
   9 ARTICLE: "regexp" "Regular expressions"
  10 "The " { $vocab-link "regexp" } " vocabulary provides words for creating and using regular expressions."
  11 { $subsections { "regexp" "intro" } }
  12 "The class of regular expressions:"
  13 { $subsections regexp }
  14 "Basic usage:"
  15 { $subsections
  16     { "regexp" "syntax" }
  17     { "regexp" "options" }
  18     { "regexp" "construction" }
  19     { "regexp" "operations" }
  20 }
  21 "Advanced topics:"
  22 { $vocab-subsection "Regular expression combinators" "regexp.combinators" }
  23 { $subsections
  24     { "regexp" "theory" }
  25     { "regexp" "deploy" }
  26 } ;
  27
  28 ARTICLE: { "regexp" "intro" } "A quick introduction to regular expressions"
  29 "Regular expressions are a terse way to do certain simple string processing tasks. For example, to replace all instances of " { $snippet "foo" } " in one string with " { $snippet "bar" } ", the following can be used:"
  30 { $code "R/ foo/ \"bar\" re-replace" }
  31 "That could be done with sequence operations, but consider doing this replacement for an arbitrary number of o's, at least two:"
  32 { $code "R/ foo+/ \"bar\" re-replace" }
  33 "The " { $snippet "+" } " operator matches one or more occurrences of the previous expression; in this case " { $snippet "o" } ". Another useful feature is alternation. Say we want to do this replacement with fooooo or boooo. Then we could use the code"
  34 { $code "R/ (f|b)oo+/ \"bar\" re-replace" }
  35 "To search a file for all lines that match a given regular expression, you could use code like this:"
  36 { $code """"file.txt" ascii file-lines [ R/ (f|b)oo+/ re-contains? ] filter""" }
  37 "To test if a string in its entirety matches a regular expression, the following can be used:"
  38 { $example """USE: regexp "fooo" R/ (b|f)oo+/ matches? .""" "t" }
  39 "Regular expressions can't be used for all parsing tasks. For example, they are not powerful enough to match balancing parentheses." ;
  40
  41 ARTICLE: { "regexp" "construction" } "Constructing regular expressions"
  42 "Most of the time, regular expressions are literals, and the parsing word should be used to construct them at parse time. This ensures that they are only compiled once, and gives parse-time syntax checking."
  43 { $subsections POSTPONE: R/ }
  44 "Sometimes, regular expressions need to be constructed at run time instead; for example, in a text editor, the user might input a regular expression to search for in a document."
  45 { $subsections <regexp> <optioned-regexp> }
  46 "Another approach is to use " { $vocab-link "regexp.combinators" } "." ;
  47
  48 ARTICLE: { "regexp" "syntax" } "Regular expression syntax"
  49 "Regexp syntax is largely compatible with Perl, Java and extended POSIX regexps, but not completely. Below, the syntax is documented."
  50 { $heading "Characters" }
  51 "At their core, regular expressions consist of character literals. For example, " { $snippet "R/ f/" } " is a regular expression matching just the string 'f'. In addition, the normal escape codes are provided, like " { $snippet "\\t" } " for the tab character and " { $snippet "\\uxxxxxx" } " for an arbitrary Unicode code point, by its hex value. In addition, any character can be preceded by a backslash to escape it, unless this has special meaning. For example, to match a literal opening parenthesis, use " { $snippet "\\(" } "."
  52 { $heading "Concatenation, alternation and grouping" }
  53 "Regular expressions can be built out of multiple characters by concatenation. For example, " { $snippet "R/ ab/" } " matches a followed by b. The " { $snippet "|" } " (alternation) operator can construct a regexp which matches one of two alternatives. Parentheses can be used for grouping. So " { $snippet "R/ f(oo|ar)/" } " would match either 'foo' or 'far'."
  54 { $heading "Character classes" }
  55 "Square brackets define a convenient way to refer to a set of characters. For example, " { $snippet "[ab]" } " refers to either a or b. And " { $snippet "[a-z]" } " refers to all of the characters between a and z, in code point order. You can use these together, as in " { $snippet "[ac-fz]" } " which matches all of the characters between c and f, in addition to a and z. Character classes can be negated using a caret, as in " { $snippet "[^a]" } " which matches all characters which are not a."
  56 { $heading "Predefined character classes" }
  57 "Several character classes are predefined, both for convenience and because they are too large to represent directly. In Factor regular expressions, all character classes are Unicode-aware."
  58 { $table
  59     { { $snippet "\\d" } "Digits" }
  60     { { $snippet "\\D" } "Not digits" }
  61     { { $snippet "\\s" } "Whitespace" }
  62     { { $snippet "\\S" } "Not whitespace" }
  63     { { $snippet "\\w" } "Word character (alphanumeric or underscore)" }
  64     { { $snippet "\\W" } "Not word character" }
  65     { { $snippet "\\p{property}" } "Character which fulfils the property" }
  66     { { $snippet "\\P{property}" } "Character which does not fulfil the property" } }
  67 "Properties for " { $snippet "\\p" } " and " { $snippet "\\P" } " (case-insensitive):"
  68 { $table
  69     { { $snippet "\\p{lower}" } "Lower case letters" }
  70     { { $snippet "\\p{upper}" } "Upper case letters" }
  71     { { $snippet "\\p{alpha}" } "Letters" }
  72     { { $snippet "\\p{ascii}" } "Characters in the ASCII range" }
  73     { { $snippet "\\p{alnum}" } "Letters or numbers" }
  74     { { $snippet "\\p{punct}" } "Punctuation" }
  75     { { $snippet "\\p{blank}" } "Non-newline whitespace" }
  76     { { $snippet "\\p{cntrl}" } "Control character" }
  77     { { $snippet "\\p{space}" } "Whitespace" }
  78     { { $snippet "\\p{xdigit}" } "Hexadecimal digit" }
  79     { { $snippet "\\p{Nd}" } "Character in Unicode category Nd" }
  80     { { $snippet "\\p{Z}" } "Character in Unicode category beginning with Z" }
  81     { { $snippet "\\p{script=Cham}" } "Character in the Cham writing system" } }
  82 { $heading "Character class operations" }
  83 "Character classes can be composed using four binary operations: " { $snippet "|| && ~~ --" } ". These do the operations union, intersection, symmetric difference and difference, respectively. For example, characters which are lower-case but not Latin script could be matched as " { $snippet "[\\p{lower}--\\p{script=latin}]" } ". These operations are right-associative, and " { $snippet "^" } " binds tighter than them. There is no syntax for grouping."
  84 { $heading "Boundaries" }
  85 "Special operators exist to match certain points in the string. These are called 'zero-width' because they do not consume any characters."
  86 { $table
  87     { { $snippet "^" } "Beginning of a line" }
  88     { { $snippet "$" } "End of a line" }
  89     { { $snippet "\\A" } "Beginning of text" }
  90     { { $snippet "\\z" } "End of text" }
  91     { { $snippet "\\Z" } "Almost end of text: only thing after is newline" }
  92     { { $snippet "\\b" } "Word boundary (by Unicode word boundaries)" }
  93     { { $snippet "\\B" } "Not word boundary (by Unicode word boundaries)" } }
  94 { $heading "Greedy quantifiers" }
  95 "It is possible to have a regular expression which matches a variable number of occurrences of another regular expression."
  96 { $table
  97     { { $snippet "a*" } "Zero or more occurrences of a" }
  98     { { $snippet "a+" } "One or more occurrences of a" }
  99     { { $snippet "a?" } "Zero or one occurrences of a" }
 100     { { $snippet "a{n}" } "n occurrences of a" }
 101     { { $snippet "a{n,}" } "At least n occurrences of a" }
 102     { { $snippet "a{,m}" } "At most m occurrences of a" }
 103     { { $snippet "a{n,m}" } "Between n and m occurrences of a" } }
 104 "All of these quantifiers are " { $emphasis "greedy" } ", meaning that they take as many repetitions as possible within the larger regular expression. Reluctant and possessive quantifiers are not yet supported."
 105 { $heading "Lookaround" }
 106 "Operators are provided to look ahead and behind the current point in the regular expression. These can be used in any context, but they're the most useful at the beginning or end of a regular expression."
 107 { $table
 108     { { $snippet "(?=a)" } "Asserts that the current position is immediately followed by a" }
 109     { { $snippet "(?!a)" } "Asserts that the current position is not immediately followed by a" }
 110     { { $snippet "(?<=a)" } "Asserts that the current position is immediately preceded by a" }
 111     { { $snippet "(?<!a)" } "Asserts that the current position is not immediately preceded by a" } }
 112 { $heading "Quotation" }
 113 "To make it convenient to have a long string which uses regexp operators, a special syntax is provided. If a substring begins with " { $snippet "\\Q" } " then everything until " { $snippet "\\E" } " is quoted (escaped). For example, " { $snippet "R/ \\Qfoo\\bar|baz()\\E/" } " matches exactly the string " { $snippet "\"foo\\bar|baz()\"" } "."
 114 { $heading "Unsupported features" }
 115 { $subheading "Group capture" }
 116 { $subheading "Reluctant and possessive quantifiers" }
 117 { $subheading "Backreferences" }
 118 "Backreferences were omitted because of a design decision to allow only regular expressions following the formal theory of regular languages. For more information, see " { $link { "regexp" "theory" } } "."
 119 $nl
 120 "To work around the lack of backreferences, consider using group capture and then creating a new regular expression to match the captured string using " { $vocab-link "regexp.combinators" } "."
 121 { $subheading "Previous match" }
 122 "Another feature that is not included is Perl's " { $snippet "\\G" } " syntax, which references the previous match. This is because that sequence is inherently stateful, and Factor regexps don't hold state."
 123 { $subheading "Embedding code" }
 124 "Operations which embed code into a regexp are not supported. This would require the inclusion of the Factor parser and compiler in any deployed application which wants to expose regexps to the user, leading to an undesirable increase in the code size."
 125 { $heading "Casing operations" }
 126 "No special casing operations are included, for example Perl's " { $snippet "\\L" } "." ;
 127
 128 ARTICLE: { "regexp" "options" } "Regular expression options"
 129 "When " { $link { "regexp" "construction" } } ", various options can be provided. Options have single-character names. A string of options has one of the following two forms:"
 130 { $code "on" "on-off" }
 131 "The latter syntax allows some options to be disabled. The " { $snippet "on" } " and " { $snippet "off" } " strings name options to be enabled and disabled, respectively."
 132 $nl
 133 "The following options are supported:"
 134 { $table
 135   { "i" { $link case-insensitive } }
 136   { "d" { $link unix-lines } }
 137   { "m" { $link multiline } }
 138   { "s" { $link dotall } }
 139   { "r" { $link reversed-regexp } }
 140 } ;
 141
 142 HELP: case-insensitive
 143 { $syntax "R/ .../i" }
 144 { $description "On regexps, the " { $snippet "i" } " option makes the match case-insensitive. Currently, this is handled incorrectly with respect to Unicode, as characters like ß do not expand into SS in upper case. This should be fixed in a future version." } ;
 145
 146 HELP: unix-lines
 147 { $syntax "R/ .../d" }
 148 { $description "With this mode, only newlines (" { $snippet "\\n" } ") are recognized for line breaking. This affects " { $snippet "$" } " and " { $snippet "^" } " when in multiline mode." } ;
 149
 150 HELP: multiline
 151 { $syntax "R/ .../m" }
 152 { $description "This mode makes the zero-width constraints " { $snippet "$" } " and " { $snippet "^" } " match the beginning or end of a line. Otherwise, they only match the beginning or end of the input text. This can be used together with " { $link dotall } "." } ;
 153
 154 HELP: dotall
 155 { $syntax "R/ .../s" }
 156 { $description "This mode, traditionally called single line mode, makes " { $snippet "." } " match everything, including line breaks. By default, it does not match line breaking characters. This can be used together with " { $link multiline } "." } ;
 157
 158 HELP: reversed-regexp
 159 { $syntax "R/ .../r" }
 160 { $description "When running a regexp compiled with this mode, matches will start from the end of the input string, going towards the beginning." } ;
 161
 162 ARTICLE: { "regexp" "theory" } "The theory of regular expressions"
 163 "Far from being just a practical tool invented by Unix hackers, regular expressions were studied formally before computer programs were written to process them." $nl
 164 "A regular language is a set of strings that is matched by a regular expression, which is defined to have characters and the empty string, along with the operations concatenation, disjunction and Kleene star. Another way to define the class of regular languages is as the class of languages which can be recognized with constant space overhead, i.e., with a DFA. These two definitions are provably equivalent." $nl
 165 "One basic result in the theory of regular languages is that the complement of a regular language is regular. In other words, for any regular expression, there exists another regular expression which matches exactly the strings that the first one doesn't match." $nl
 166 "This implies, by De Morgan's laws, that, if you have two regular languages, their intersection is also regular. That is, for any two regular expressions, there exists a regular expression which matches strings that match both inputs." $nl
 167 "Traditionally, regular expressions on computers support an additional operation: backreferences. For example, the Perl regexp " { $snippet "/(.*)\\1/" } " matches a string repeated twice. If a backreference refers to a string with a predetermined maximum length, then the resulting language is still regular." $nl
 168 "But, if not, the language is not regular. There is strong evidence that there is no efficient way to parse with backreferences in the general case. Perl uses a naive backtracking algorithm which has pathological behavior in some cases, taking exponential time to match even if backreferences aren't used. Additionally, expressions with backreferences don't have the properties with negation and intersection described above." $nl
 169 "The Factor regular expression engine was built with the design decision to support negation and intersection at the expense of backreferences. This lets us have a guaranteed linear-time matching algorithm. Systems like Ragel and Lex use the same algorithm." ;
 170
 171 ARTICLE: { "regexp" "operations" } "Matching operations with regular expressions"
 172 "Testing if a string matches a regular expression:"
 173 { $subsections matches? }
 174 "Finding a match inside a string:"
 175 { $subsections re-contains? first-match }
 176 "Finding all matches inside a string:"
 177 { $subsections
 178     count-matches
 179     all-matching-slices
 180     all-matching-subseqs
 181 }
 182 "Splitting a string into tokens delimited by a regular expression:"
 183 { $subsections re-split }
 184 "Replacing occurrences of a regular expression with a string:"
 185 { $subsections re-replace } ;
 186
 187 ARTICLE: { "regexp" "deploy" } "Regular expressions and the deploy tool"
 188 "The " { $link "tools.deploy" } " tool has the option to strip out the optimizing compiler from the resulting image. Since regular expressions compile to Factor code, this creates a minor performance-related caveat."
 189 $nl
 190 "Regular expressions constructed at runtime from a deployed application will be compiled with the non-optimizing compiler, which is always available because it is built into the Factor VM. This will result in lower performance than when using the optimizing compiler."
 191 $nl
 192 "Literal regular expressions constructed at parse time do not suffer from this restriction, since the deployed application is loaded and compiled before anything is stripped out."
 193 $nl
 194 "None of this applies to deployed applications which include the optimizing compiler, or code running inside a development image."
 195 { $see-also "compiler" { "regexp" "construction" } "deploy-flags" } ;
 196
 197 HELP: <regexp>
 198 { $values { "string" string } { "regexp" regexp } }
 199 { $description "Creates a regular expression object, given a string in regular expression syntax. When it is first used for matching, a DFA is compiled, and this DFA is stored for reuse so it is only compiled once." } ;
 200
 201 HELP: <optioned-regexp>
 202 { $values { "string" string } { "options" "a string of " { $link { "regexp" "options" } } } { "regexp" regexp } }
 203 { $description "Given a string in regular expression syntax, and a string of options, creates a regular expression object. When it is first used for matching, a DFA is compiled, and this DFA is stored for reuse so it is only compiled once." } ;
 204
 205 HELP: R/
 206 { $syntax "R/ foo.*|[a-zA-Z]bar/options" }
 207 { $description "Literal syntax for a regular expression. When this syntax is used, the DFA is compiled at compile-time, rather than on first use. The syntax for the " { $snippet "options" } " string is documented in " { $link { "regexp" "options" } } "." } ;
 208
 209 HELP: regexp
 210 { $class-description "The class of regular expressions. To construct these, see " { $link { "regexp" "construction" } } "." } ;
 211
 212 HELP: matches?
 213 { $values { "string" string } { "regexp" regexp } { "?" "a boolean" } }
 214 { $description "Tests if the string as a whole matches the given regular expression." } ;
 215
 216 HELP: all-matching-slices
 217 { $values { "string" string } { "regexp" regexp } { "seq" "a sequence of slices of the input" } }
 218 { $description "Finds a sequence of disjoint substrings which each match the pattern. It chooses this by finding the leftmost longest match, and then the leftmost longest match which starts after the end of the previous match, and so on." } ;
 219
 220 HELP: count-matches
 221 { $values { "string" string } { "regexp" regexp } { "n" integer } }
 222 { $description "Counts how many disjoint matches the regexp has in the string, as made unambiguous by " { $link all-matching-slices } "." } ;
 223
 224 HELP: re-split
 225 { $values { "string" string } { "regexp" regexp } { "seq" "a sequence of slices of the input" } }
 226 { $description "Splits the input string into chunks separated by the regular expression. Each chunk contains no match of the regexp. The chunks are chosen by the strategy of " { $link all-matching-slices } "." } ;
 227
 228 HELP: re-replace
 229 { $values { "string" string } { "regexp" regexp } { "replacement" string } { "result" string } }
 230 { $description "Replaces substrings which match the input regexp with the given replacement text. The boundaries of the substring are chosen by the strategy used by " { $link all-matching-slices } "." } ;
 231
 232 HELP: first-match
 233 { $values { "string" string } { "regexp" regexp } { "slice/f" "the match, if one exists" } }
 234 { $description "Finds the first match of the regular expression in the string, and returns it as a slice. If there is no match, then " { $link f } " is returned." } ;
 235
 236 HELP: re-contains?
 237 { $values { "string" string } { "regexp" regexp } { "?" "a boolean" } }
 238 { $description "Determines whether the string has a substring which matches the regular expression given." } ;