libs/xml/tokenizer.factor

   1 ! Copyright (C) 2005, 2006 Daniel Ehrenberg\r
   2 ! See http://factorcode.org/license.txt for BSD license.\r
   3 IN: xml\r
   4 USING: errors hashtables io kernel math namespaces prettyprint\r
   5 sequences tools generic strings char-classes ;\r
   6 \r
   7 ! -- Low-level parsing\r
   8 ! Code stored in stdio\r
   9 ! Spot is composite so it won't be lost in sub-scopes\r
  10 SYMBOL: spot #! { char line column line-str }\r
  11 : get-char ( -- char ) spot get first ;\r
  12 : set-char ( char -- ) 0 spot get set-nth ;\r
  13 : get-line ( -- line ) spot get second ;\r
  14 : set-line ( line -- ) 1 spot get set-nth ;\r
  15 : get-column ( -- column ) spot get third ;\r
  16 : set-column ( column -- ) 2 spot get set-nth ;\r
  17 : get-line-str ( -- line-str ) 3 spot get nth ;\r
  18 : set-line-str ( line-str -- ) 3 spot get set-nth ;\r
  19 SYMBOL: prolog-data\r
  20 \r
  21 ! Record is composite so it changes in nested scopes\r
  22 SYMBOL: record ! string\r
  23 SYMBOL: now-recording? ! t/f\r
  24 : recording? ( -- t/f ) now-recording? get ;\r
  25 : get-record ( -- sbuf ) record get ;\r
  26 \r
  27 : push-record ( ch -- )\r
  28     get-record push ;\r
  29 : new-record ( -- )\r
  30     SBUF" " clone record set\r
  31     t now-recording? set\r
  32     get-char [ push-record ] when* ;\r
  33 : unrecord ( -- )\r
  34     record get pop* ;\r
  35 \r
  36 : (end-record) ( -- sbuf )\r
  37     f now-recording? set\r
  38     get-record ;\r
  39 : end-record* ( n -- string )\r
  40     (end-record) tuck length swap -\r
  41     head-slice >string ;\r
  42 : end-record ( -- string )\r
  43     get-record length 0 =\r
  44     [ "" f recording? set ]\r
  45     [ 1 end-record* ] if ;\r
  46 \r
  47 !   -- Error reporting\r
  48 \r
  49 TUPLE: xml-error line column ;\r
  50 C: xml-error ( -- xml-error )\r
  51     [ get-line swap set-xml-error-line ] keep\r
  52     [ get-column swap set-xml-error-column ] keep ;\r
  53 \r
  54 : xml-error. ( xml-error -- )\r
  55     "XML error" print\r
  56     "Line: " write dup xml-error-line .\r
  57     "Column: " write xml-error-column . ;\r
  58 \r
  59 TUPLE: expected should-be was ;\r
  60 C: expected ( should-be was -- error )\r
  61     [ <xml-error> swap set-delegate ] keep\r
  62     [ set-expected-was ] keep\r
  63     [ set-expected-should-be ] keep ;\r
  64 \r
  65 M: expected error.\r
  66     dup xml-error.\r
  67     "Token expected: " write dup expected-should-be print\r
  68     "Token present: " write expected-was print ;\r
  69 \r
  70 TUPLE: no-entity thing ;\r
  71 C: no-entity ( string -- entitiy )\r
  72     [ <xml-error> swap set-delegate ] keep\r
  73     [ set-no-entity-thing ] keep ;\r
  74 \r
  75 M: no-entity error.\r
  76     dup xml-error.\r
  77     "Entity does not exist: &" write no-entity-thing write ";" print ;\r
  78 \r
  79 TUPLE: xml-string-error string ;\r
  80 C: xml-string-error ( string -- xml-string-error )\r
  81     [ set-xml-string-error-string ] keep\r
  82     [ <xml-error> swap set-delegate ] keep ;\r
  83 \r
  84 M: xml-string-error error.\r
  85     dup xml-error.\r
  86     xml-string-error-string print ;\r
  87 \r
  88 !   -- Basic utility words\r
  89 \r
  90 : next-line ( -- string )\r
  91     ! read a non-blank line\r
  92     readln dup "" = [ drop next-line ] when ;\r
  93 \r
  94 : (next) ( -- char )\r
  95     get-column get-line-str 2dup length 1- < [\r
  96         >r 1+ dup set-column r> nth\r
  97     ] [\r
  98         2drop 0 set-column\r
  99         next-line dup set-line-str\r
 100         [ first ] [ f ] if*\r
 101         get-line 1+ set-line\r
 102     ] if ;\r
 103 \r
 104 : next ( -- )\r
 105     #! Increment spot.\r
 106     get-char [\r
 107          "XML document unexpectedly ended"\r
 108         <xml-string-error> throw\r
 109     ] unless\r
 110     (next) dup set-char\r
 111     recording? over and [ push-record ] [ drop ] if ;\r
 112 \r
 113 : skip-until ( quot -- )\r
 114     #! quot: ( -- ? )\r
 115     get-char [\r
 116         [ call ] keep swap [ drop ] [\r
 117             next skip-until\r
 118         ] if\r
 119     ] [ 2drop ] if ; inline\r
 120 \r
 121 : take-until ( quot -- string | quot: -- ? )\r
 122     #! Take the substring of a string starting at spot\r
 123     #! from code until the quotation given is true and\r
 124     #! advance spot to after the substring.\r
 125     new-record skip-until end-record ; inline\r
 126 \r
 127 : take-char ( ch -- string )\r
 128     [ dup get-char = ] take-until nip ;\r
 129 \r
 130 : pass-blank ( -- )\r
 131     #! Advance code past any whitespace, including newlines\r
 132     [ get-char blank? not ] skip-until ;\r
 133 \r
 134 : string-matches? ( string -- ? )\r
 135     dup length get-column tuck +\r
 136     dup get-line-str length <=\r
 137     [ get-line-str <slice> sequence= ]\r
 138     [ 3drop f ] if ;\r
 139 \r
 140 : take-string ( match -- string )\r
 141     ! match must not contain a newline\r
 142     [ dup string-matches? ] take-until\r
 143     get-line-str\r
 144     [ "Missing closing token" <xml-string-error> throw ] unless\r
 145     swap length [ next ] times ;\r
 146 \r
 147 !   -- Parsing strings\r
 148 \r
 149 : expect ( ch -- )\r
 150     get-char 2dup = [ 2drop ] [\r
 151         >r ch>string r> ch>string <expected> throw\r
 152     ] if next ;\r
 153 \r
 154 : expect-string* ( num -- )\r
 155     #! only skips string, and only for when you're sure the string is there\r
 156     [ next ] times ;\r
 157 \r
 158 : expect-string ( string -- )\r
 159     ! TODO: add error if this isn't long enough\r
 160     new-record dup length [ next ] times\r
 161     end-record 2dup = [ 2drop ]\r
 162     [ <expected> throw ] if ;\r
 163 \r
 164 TUPLE: prolog version encoding standalone ; ! part of xml-doc, see parser\r
 165 \r
 166 : entities\r
 167     #! We have both directions here as a shortcut.\r
 168     H{\r
 169         { "lt"    CHAR: <  }\r
 170         { "gt"    CHAR: >  }\r
 171         { "amp"   CHAR: &  }\r
 172         { "apos"  CHAR: '  }\r
 173         { "quot"  CHAR: "  }\r
 174         { CHAR: < "&lt;"   }\r
 175         { CHAR: > "&gt;"   }\r
 176         { CHAR: & "&amp;"  }\r
 177         { CHAR: ' "&apos;" }\r
 178         { CHAR: " "&quot;" }\r
 179     } ;\r
 180 \r
 181 TUPLE: entity name ;\r
 182 \r
 183 : (parse-entity) ( string -- )\r
 184     dup entities hash [ push-record ] [ \r
 185         prolog-data get prolog-standalone\r
 186         [ <no-entity> throw ] [\r
 187             end-record , <entity> , next new-record\r
 188         ] if\r
 189     ] ?if ;\r
 190 \r
 191 : parse-entity ( -- )\r
 192     next unrecord unrecord \r
 193     ! the following line is in a scope to shield this\r
 194     ! word from the record-altering side effects of\r
 195     ! take-until.\r
 196     [ CHAR: ; take-char ] with-scope\r
 197     "#" ?head [\r
 198         "x" ?head 16 10 ? base>\r
 199         push-record\r
 200     ] [ (parse-entity) ] if ;\r
 201 \r
 202 TUPLE: reference name ;\r
 203 \r
 204 : parse-reference ( -- )\r
 205     next unrecord end-record , CHAR: ; take-char\r
 206     <reference> , next new-record ;\r
 207 \r
 208 : (parse-char) ( ch -- )\r
 209     get-char {\r
 210         { [ dup not ]\r
 211           [ 2drop 0 end-record* , ] }\r
 212         { [ 2dup = ]\r
 213           [ 2drop end-record , next ] }\r
 214         { [ dup CHAR: & = ]\r
 215           [ drop parse-entity (parse-char) ] }\r
 216         { [ CHAR: % = ] [ parse-reference (parse-char) ] }\r
 217         { [ t ] [ next (parse-char) ] }\r
 218     } cond ;\r
 219 \r
 220 : parse-char ( ch -- array )\r
 221     [ new-record (parse-char) ] { } make ;\r
 222 \r
 223 : parse-quot ( ch -- array )\r
 224     parse-char get-char\r
 225     [ "XML file ends in a quote" <xml-string-error> throw ] unless ;\r
 226 \r
 227 : parse-text ( -- array )\r
 228     CHAR: < parse-char ;\r
 229 \r
 230 !   -- Parsing names\r
 231 \r
 232 TUPLE: name space tag url ;\r
 233 \r
 234 : version=1.0? ( -- ? )\r
 235     prolog-data get prolog-version "1.0" = ;\r
 236 \r
 237 ! version=1.0? is calculated once and passed around for efficiency\r
 238 : name-start-char? ( 1.0? char -- ? )\r
 239     swap [ 1.0name-start-char? ] [ 1.1name-start-char? ] if ;\r
 240 \r
 241 : name-char? ( 1.0? char -- ? )\r
 242     swap [ 1.0name-char? ] [ 1.1name-char? ] if ;\r
 243 \r
 244 : (parse-name) ( -- str )\r
 245     version=1.0? dup\r
 246     new-record get-char name-start-char? [\r
 247         [ dup get-char name-char? not ] skip-until\r
 248         drop end-record\r
 249     ] [\r
 250         "Malformed name" <xml-string-error> throw\r
 251     ] if ;\r
 252 \r
 253 : parse-name ( -- name )\r
 254     (parse-name) get-char CHAR: : =\r
 255     [ next (parse-name) ] [ "" swap ] if f <name> ;\r
 256 \r
 257 : ?= ( object/f object/f -- ? )\r
 258     2dup and [ = ] [ 2drop t ] if ;\r
 259 \r
 260 : names-match? ( name1 name2 -- ? )\r
 261     [ name-space swap name-space ?= ] 2keep\r
 262     [ name-url swap name-url ?= ] 2keep\r
 263     name-tag swap name-tag ?= and and ;\r