Source file ppx_sedlex.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
open Longident
open Migrate_parsetree
open Ast_405
open Parsetree
open Asttypes
open Ast_helper
open Ast_convenience_405
module Ast_mapper_class = Ast_mapper_class_405
let ocaml_version = Versions.ocaml_405
module Cset = Sedlex_cset
type decision_tree =
| Lte of int * decision_tree * decision_tree
| Table of int * int array
| Return of int
let decision l =
let l = List.map (fun (a, b, i) -> (a, b, Return i)) l in
let rec merge2 = function
| (a1, b1, d1) :: (a2, b2, d2) :: rest ->
let x =
if b1 + 1 = a2 then d2
else Lte (a2 - 1, Return (-1), d2)
in
(a1, b2, Lte (b1, d1, x)) :: merge2 rest
| rest -> rest
in
let rec aux = function
| [(a, b, d)] -> Lte (a - 1, Return (-1), Lte (b, d, Return (-1)))
| [] -> Return (-1)
| l -> aux (merge2 l)
in
aux l
let limit = 8192
let decision_table l =
let rec aux m accu = function
| ((a, b, i) as x)::rem when b < limit && i < 255->
aux (min a m) (x :: accu) rem
| rem -> m, accu, rem
in
let (min, table, rest) = aux max_int [] l in
match table with
| [] -> decision l
| [(min, max, i)] ->
Lte (min - 1, Return (-1), (Lte (max, Return i, decision rest)))
| (_, max, _) :: _ ->
let arr = Array.make (max - min + 1) 0 in
let set (a, b, i) = for j = a to b do arr.(j - min) <- i + 1 done in
List.iter set table;
Lte (min - 1, Return (-1), Lte (max, Table (min, arr), decision rest))
let rec simplify min max = function
| Lte (i,yes,no) ->
if i >= max then simplify min max yes
else if i < min then simplify min max no
else Lte (i, simplify min i yes, simplify (i+1) max no)
| x -> x
let segments_of_partition p =
let seg = ref [] in
Array.iteri
(fun i c -> List.iter (fun (a, b) -> seg := (a, b, i) :: !seg) c)
p;
List.sort (fun (a1,_,_) (a2,_,_) -> compare a1 a2) !seg
let decision_table p =
simplify (-1) (Cset.max_code) (decision_table (segments_of_partition p))
let appfun s l = app (evar s) l
let glb_value name def = Str.value Nonrecursive [Vb.mk (pvar name) def]
module StringMap = Map.Make(struct
type t = string
let compare = compare
end)
let builtin_regexps =
List.fold_left (fun acc (n, c) -> StringMap.add n (Sedlex.chars c) acc)
StringMap.empty
([
"any", Cset.any;
"eof", Cset.eof;
"xml_letter", Cset.letter;
"xml_digit", Cset.digit;
"xml_extender", Cset.extender;
"xml_base_char", Cset.base_char;
"xml_ideographic", Cset.ideographic;
"xml_combining_char", Cset.combining_char;
"xml_blank", Cset.blank;
"tr8876_ident_char", Cset.tr8876_ident_char] @
Unicode.Categories.list @
Unicode.Properties.list)
let tables = Hashtbl.create 31
let table_counter = ref 0
let get_tables () = Hashtbl.fold (fun key x accu -> (x, key) :: accu) tables []
let table_name x =
try Hashtbl.find tables x
with Not_found ->
incr table_counter;
let s = Printf.sprintf "__sedlex_table_%i" !table_counter in
Hashtbl.add tables x s;
s
let table (name, v) =
let n = Array.length v in
let s = Bytes.create n in
for i = 0 to n - 1 do Bytes.set s i (Char.chr v.(i)) done;
glb_value name (str (Bytes.to_string s))
let partitions = Hashtbl.create 31
let partition_counter = ref 0
let get_partitions () = Hashtbl.fold (fun key x accu -> (x, key) :: accu) partitions []
let partition_name x =
try Hashtbl.find partitions x
with Not_found ->
incr partition_counter;
let s = Printf.sprintf "__sedlex_partition_%i" !partition_counter in
Hashtbl.add partitions x s;
s
let partition (name, p) =
let rec gen_tree = function
| Lte (i, yes, no) ->
[%expr if c <= [%e int i] then [%e gen_tree yes] else [%e gen_tree no]]
| Return i -> int i
| Table (offset, t) ->
let c = if offset = 0 then [%expr c] else [%expr c - [%e int offset]] in
[%expr Char.code (String.get [%e evar (table_name t)] [%e c]) - 1]
in
let body = gen_tree (decision_table p) in
glb_value name (func [(pconstr "Some" [pvar "uc"],
[%expr let c = Uchar.to_int uc in [%e body]]);
(pconstr "None" [],
[%expr let c = (-1) in [%e body]])])
let best_final final =
let fin = ref None in
for i = Array.length final - 1 downto 0 do
if final.(i) then fin := Some i
done;
!fin
let state_fun state = Printf.sprintf "__sedlex_state_%i" state
let call_state lexbuf auto state =
let (trans, final) = auto.(state) in
if Array.length trans = 0
then match best_final final with
| Some i -> int i
| None -> assert false
else appfun (state_fun state) [evar lexbuf]
let gen_state lexbuf auto i (trans, final) =
let partition = Array.map fst trans in
let cases = Array.mapi (fun i (_, j) -> Exp.case(pint i) (call_state lexbuf auto j)) trans in
let cases = Array.to_list cases in
let body () =
Exp.match_
(appfun (partition_name partition) [[%expr Sedlexing.next [%e evar lexbuf]]])
(cases @ [Exp.case [%pat? _] [%expr Sedlexing.backtrack [%e evar lexbuf]]])
in
let ret body = [ Vb.mk (pvar (state_fun i)) (func [pvar lexbuf, body]) ] in
match best_final final with
| None -> ret (body ())
| Some _ when Array.length trans = 0 -> []
| Some i -> ret [%expr Sedlexing.mark [%e evar lexbuf] [%e int i]; [%e body ()]]
let gen_recflag auto =
try
Array.iter
(fun (trans_i, _) ->
Array.iter
(fun (_, j) ->
let (trans_j, _) = auto.(j) in
if Array.length trans_j > 0 then raise Exit)
trans_i)
auto;
Nonrecursive
with
Exit -> Recursive
let gen_definition lexbuf l error =
let brs = Array.of_list l in
let auto = Sedlex.compile (Array.map fst brs) in
let cases = Array.to_list (Array.mapi (fun i (_, e) -> Exp.case (pint i) e) brs) in
let states = Array.mapi (gen_state lexbuf auto) auto in
let states = List.flatten (Array.to_list states) in
Exp.let_ (gen_recflag auto) states
(Exp.sequence
[%expr Sedlexing.start [%e evar lexbuf]]
(Exp.match_ (appfun (state_fun 0) [evar lexbuf])
(cases @ [Exp.case (Pat.any ()) error])
)
)
let codepoint i =
if i < 0 || i > Cset.max_code then
failwith (Printf.sprintf "Invalid Unicode code point: %i" i);
i
let regexp_for_char c =
Sedlex.chars (Cset.singleton (Char.code c))
let regexp_for_string s =
let rec aux n =
if n = String.length s then Sedlex.eps
else
Sedlex.seq (regexp_for_char s.[n]) (aux (succ n))
in aux 0
let err loc s =
raise (Location.Error (Location.error ~loc ("Sedlex: " ^ s)))
let rec repeat r = function
| 0, 0 -> Sedlex.eps
| 0, m -> Sedlex.alt Sedlex.eps (Sedlex.seq r (repeat r (0, m - 1)))
| n, m -> Sedlex.seq r (repeat r (n - 1, m - 1))
let regexp_of_pattern env =
let rec char_pair_op func name p tuple =
match tuple with
| Some {ppat_desc=Ppat_tuple (p0 :: p1 :: [])} ->
begin match func (aux p0) (aux p1) with
| Some r -> r
| None ->
err p.ppat_loc @@
"the "^name^" operator can only applied to single-character length regexps"
end
| _ -> err p.ppat_loc @@ "the "^name^" operator requires two arguments, like "^name^"(a,b)"
and aux p =
match p.ppat_desc with
| Ppat_or (p1, p2) -> Sedlex.alt (aux p1) (aux p2)
| Ppat_tuple (p :: pl) ->
List.fold_left (fun r p -> Sedlex.seq r (aux p))
(aux p)
pl
| Ppat_construct ({txt = Lident "Star"}, Some p) ->
Sedlex.rep (aux p)
| Ppat_construct ({txt = Lident "Plus"}, Some p) ->
Sedlex.plus (aux p)
| Ppat_construct
({txt = Lident "Rep"},
Some {ppat_desc=Ppat_tuple[p0; {ppat_desc=Ppat_constant (i1 as i2)|Ppat_interval(i1, i2)}]}) ->
begin match Constant.of_constant i1, Constant.of_constant i2 with
| Pconst_integer(i1,_), Pconst_integer(i2,_) ->
let i1 = int_of_string i1 in
let i2 = int_of_string i2 in
if 0 <= i1 && i1 <= i2 then repeat (aux p0) (i1, i2)
else err p.ppat_loc "Invalid range for Rep operator"
| _ ->
err p.ppat_loc "Rep must take an integer constant or interval"
end
| Ppat_construct ({txt = Lident "Rep"}, _) ->
err p.ppat_loc "the Rep operator takes 2 arguments"
| Ppat_construct ({txt = Lident "Opt"}, Some p) ->
Sedlex.alt Sedlex.eps (aux p)
| Ppat_construct ({txt = Lident "Compl"}, arg) ->
begin match arg with
| Some p0 ->
begin match Sedlex.compl (aux p0) with
| Some r -> r
| None ->
err p.ppat_loc
"the Compl operator can only applied to a single-character length regexp"
end
| _ -> err p.ppat_loc "the Compl operator requires an argument"
end
| Ppat_construct ({ txt = Lident "Sub" }, arg) ->
char_pair_op Sedlex.subtract "Sub" p arg
| Ppat_construct ({ txt = Lident "Intersect" }, arg) ->
char_pair_op Sedlex.intersection "Intersect" p arg
| Ppat_construct ({txt = Lident "Chars"}, arg) ->
let const = match arg with
| Some {ppat_desc=Ppat_constant const} ->
Some (Constant.of_constant const)
| _ -> None
in
begin match const with
| Some (Pconst_string(s,_))->
let c = ref Cset.empty in
for i = 0 to String.length s - 1 do
c := Cset.union !c (Cset.singleton (Char.code s.[i]))
done;
Sedlex.chars !c
| _ -> err p.ppat_loc "the Chars operator requires a string argument"
end
| Ppat_interval (i_start, i_end) ->
begin match Constant.of_constant i_start, Constant.of_constant i_end with
| Pconst_char c1, Pconst_char c2 -> Sedlex.chars (Cset.interval (Char.code c1) (Char.code c2))
| Pconst_integer(i1,_), Pconst_integer(i2,_) ->
Sedlex.chars (Cset.interval (codepoint (int_of_string i1)) (codepoint (int_of_string i2)))
| _ -> err p.ppat_loc "this pattern is not a valid interval regexp"
end
| Ppat_constant (const) ->
begin match Constant.of_constant const with
| Pconst_string (s, _) -> regexp_for_string s
| Pconst_char c -> regexp_for_char c
| Pconst_integer(i,_) -> Sedlex.chars (Cset.singleton (codepoint (int_of_string i)))
| _ -> err p.ppat_loc "this pattern is not a valid regexp"
end
| Ppat_var {txt=x} ->
begin try StringMap.find x env
with Not_found ->
err p.ppat_loc (Printf.sprintf "unbound regexp %s" x)
end
| _ ->
err p.ppat_loc "this pattern is not a valid regexp"
in
aux
let mapper cookies =
object(this)
inherit Ast_mapper_class.mapper as super
val env = builtin_regexps
method define_regexp name p =
{< env = StringMap.add name (regexp_of_pattern env p) env >}
method! expr e =
match e with
| [%expr [%sedlex [%e? {pexp_desc=Pexp_match (lexbuf, cases)}]]] ->
let lexbuf =
match lexbuf with
| {pexp_desc=Pexp_ident{txt=Lident lexbuf}} -> lexbuf
| _ ->
err lexbuf.pexp_loc "the matched expression must be a single identifier"
in
let cases = List.rev cases in
let error =
match List.hd cases with
| {pc_lhs = [%pat? _]; pc_rhs = e; pc_guard = None} -> super # expr e
| {pc_lhs = p} ->
err p.ppat_loc "the last branch must be a catch-all error case"
in
let cases = List.rev (List.tl cases) in
let cases =
List.map
(function
| {pc_lhs = p; pc_rhs = e; pc_guard = None} -> regexp_of_pattern env p, super # expr e
| {pc_guard = Some e} ->
err e.pexp_loc "'when' guards are not supported"
) cases
in
gen_definition lexbuf cases error
| [%expr let [%p? {ppat_desc=Ppat_var{txt=name}}] = [%sedlex.regexp? [%p? p]] in [%e? body]] ->
(this # define_regexp name p) # expr body
| [%expr [%sedlex [%e? _]]] ->
err e.pexp_loc "the %sedlex extension is only recognized on match expressions"
| _ -> super # expr e
val toplevel = true
method structure_with_regexps l =
let mapper = ref this in
let regexps = ref [] in
let l = List.concat
(List.map
(function
| [%stri let [%p? {ppat_desc=Ppat_var{txt=name}}] = [%sedlex.regexp? [%p? p]]] as i ->
regexps := i :: !regexps;
mapper := !mapper # define_regexp name p;
[]
| i ->
[ !mapper # structure_item i ]
) l) in
(l, List.rev !regexps)
method! structure l =
if toplevel then
let sub = {< toplevel = false >} in
let previous =
match Driver.get_cookie cookies "sedlex.regexps" ocaml_version with
| Some {pexp_desc = Pexp_extension (_, PStr l)} -> l
| Some _ -> assert false
| None -> []
in
let l, regexps = sub # structure_with_regexps (previous @ l) in
let parts = List.map partition (get_partitions ()) in
let tables = List.map table (get_tables ()) in
Driver.set_cookie cookies "sedlex.regexps" ocaml_version (Exp.extension (Location.mknoloc "regexps", PStr regexps));
tables @ parts @ l
else
fst (this # structure_with_regexps l)
end
let () =
Driver.register
~name:"sedlex"
ocaml_version
(fun _ cookies -> Ast_mapper_class.to_mapper (mapper cookies))