Source file regexp.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
let mkloc = Location.mkloc
let (%) f g x = f (g x)
type 'a t = 'a node Location.loc
and 'a node =
| Code of 'a
| Seq of 'a t list
| Alt of 'a t list
| Opt of 'a t
| Repeat of (int * int option) Location.loc * 'a t
| Nongreedy of 'a t
| Capture of 'a t
| Capture_as of string Location.loc * 'a t
| Call of Longident.t Location.loc
let nonepsilon = function {Location.txt = Seq []; _} -> false | _ -> true
let simplify_seq ~loc es =
(match List.filter nonepsilon es with
| [e] -> e
| es -> mkloc (Seq es) loc)
let simplify_alt es =
(match es with
| [e] -> e.Location.txt
| es -> Alt es)
module Int_map = struct
module M = Map.Make (struct type t = int let compare = compare end)
[@@@ocaml.warning "-32"]
let find_last f m = List.find (fun (k, _) -> f k) (List.rev (M.bindings m))
include M
end
let parse_exn ?(pos = Lexing.dummy_pos) s =
let l = String.length s in
let get i = if i = l then ')' else s.[i] in
let position_of_index =
if pos = Lexing.dummy_pos then (fun _ -> Lexing.dummy_pos) else
let newlines =
let rec loop acc lnum i =
if i = l then acc else
if s.[i] <> '\n' then loop acc lnum (i + 1) else
loop (Int_map.add (i + 1) (lnum + 1) acc) (lnum + 1) (i + 1)
in
loop (Int_map.singleton 0 pos.pos_lnum) pos.pos_lnum 0
in
fun i ->
let j, pos_lnum = Int_map.find_last (fun j -> j <= i) newlines in
{ pos with
pos_lnum;
pos_bol = pos.pos_bol + j;
pos_cnum = pos.pos_cnum + i; }
in
let make_loc (i, j) =
let open Location in
if pos = Lexing.dummy_pos then Location.none else
{ loc_start = position_of_index i;
loc_end = position_of_index j;
loc_ghost = false }
in
let wrap_loc (i, j) x = Location.{txt = x; loc = make_loc (i, j)} in
let with_loc f i = let j, e = f i in j, wrap_loc (i, j) e in
let suffix_loc j f (e : _ Location.loc) =
let e' = f e in
if pos = Lexing.dummy_pos then Location.mknoloc e' else
let loc = Location.{
loc_start = e.loc.loc_start;
loc_end = position_of_index j;
loc_ghost = false;
} in
mkloc e' loc
in
let fail (i, j) msg = Location.raise_errorf ~loc:(make_loc (i, j)) "%s" msg in
let scan_ident i =
let rec scan_cont j =
(match get j with
| 'A'..'Z' | 'a'..'z' | '0'..'9' | '_' | '\'' -> scan_cont (j + 1)
| _ -> (j, String.sub s i (j - i)))
in
(match get i with
| 'A'..'Z' | 'a'..'z' | '_' -> scan_cont (i + 1)
| _ -> fail (i, i) "Expecting an identifier.")
in
let rec scan_longident_cont lidr i =
if get i <> '.' then (i, lidr) else
let j, idr = scan_ident (i + 1) in
scan_longident_cont (Longident.Ldot (lidr, idr)) j
in
let scan_longident i =
let j, idr = scan_ident i in
scan_longident_cont (Longident.Lident idr) j
in
let scan_ident = with_loc scan_ident in
let scan_longident = with_loc scan_longident in
let scan_longident_cont idr =
with_loc (scan_longident_cont (Longident.Lident idr)) in
let re_perl (i, j) =
let sij = String.sub s i (j - i) in
try ignore (Re.Perl.re sij); wrap_loc (i, j) (Code sij)
with Re.Perl.Parse_error | Re.Perl.Not_supported ->
fail (i, j) "Rejected by Re.Perl."
in
let scan_escape i =
if i + 1 = l then fail (i, i+1) "Escape at end of regular expression." else
(match s.[i + 1] with
| 'a'..'z' | 'A'..'Z' -> (i + 2, re_perl (i, i + 2))
| _ -> (i + 2, re_perl (i, i + 2)))
in
let rec scan_cset i j =
if j = l then fail (i, i + 1) "Unbalanced '['." else
(match s.[j] with
| '\\' ->
if j + 1 = l then
fail (j, j + 1) "Backslash at end of RE while scanning character set."
else
scan_cset i (j + 2)
| '[' when get (j + 1) = ':' ->
(match String.index_from s (j + 1) ']' with
| exception Not_found ->
fail (j + 1, j + 2) "Unbalanced '[' in character set."
| k -> scan_cset i (k + 1))
| ']' when j <> i + 1 && (j <> i + 2 || s.[i + 1] <> '^') ->
(j + 1, re_perl (i, j + 1))
| _ -> scan_cset i (j + 1))
in
let scan_int_opt i =
let rec loop i n =
if i = l then (i, n) else
(match s.[i] with
| '0'..'9' as ch -> loop (i + 1) (10 * n + (Char.code ch - 48))
| _ -> (i, n))
in
let j, n = loop i 0 in
(j, (if i = j then None else Some n))
in
let scan_range i =
let j, n_min = scan_int_opt i in
let n_min =
(match n_min with
| None -> fail (i, i) "Missing lower bound for range."
| Some n -> n) in
(match get j with
| ',' ->
let j, n_max = scan_int_opt (j + 1) in
(match n_max with
| Some n_max when n_max < n_min -> fail (i, j) "Reversed repeat range."
| _ -> ());
(j, n_min, n_max)
| _ ->
(j, n_min, (Some n_min)))
in
let apply_to_head (i, j) f = function
| [] -> fail (i, j) "Operator must follow an operand."
| e :: es -> f e :: es
in
let scan_greedyness i =
let j, greedyness =
(match get i with
| '?' -> (i + 1, suffix_loc (i + 1) (fun e -> Nongreedy e))
| '+' -> fail (i, i + 1) "Possessive modifier not supported."
| _ -> (i, (fun e -> e))) in
(match get j with
| '?' | '*' | '+' | '{' ->
fail (j, j + 1) "Nested repetition must be parenthesized."
| _ -> (j, greedyness))
in
let repeat (i, j) (n_min, n_max) =
suffix_loc j (fun e -> Repeat (wrap_loc (i, j) (n_min, n_max), e))
in
let
rec scan_alt i =
let j, e = scan_alt_item i [] in
(j, simplify_alt e)
and scan_alt_item i acc =
let j, e = scan_seq i in
(match get j with
| ')' -> (j, List.rev (e :: acc))
| '|' -> scan_alt_item (j + 1) (e :: acc)
| _ -> assert false)
and scan_seq i =
let j, e = scan_seq_item i [] in
(j, simplify_seq ~loc:(make_loc (i, j)) e)
and scan_seq_item i acc =
(match get i with
| ')' | '|' -> (i, List.rev acc)
| '[' ->
let j, e = scan_cset i (i + 1) in
scan_seq_item j (e :: acc)
| '?' ->
let j = i + 1 in
let f = suffix_loc j (fun e -> Opt e) in
let k, g = scan_greedyness j in
scan_seq_item k (apply_to_head (i, k) (g % f) acc)
| '*' ->
let j = i + 1 in
let f = repeat (i, j) (0, None) in
let k, g = scan_greedyness j in
scan_seq_item k (apply_to_head (i, k) (g % f) acc)
| '+' ->
let j = i + 1 in
let f = repeat (i, j) (1, None) in
let k, g = scan_greedyness j in
scan_seq_item k (apply_to_head (i, k) (g % f) acc)
| '{' ->
let j, n_min, n_max = scan_range (i + 1) in
if j = l || s.[j] <> '}' then fail (i, i + 1) "Unbalanced '{'." else
let f = repeat (i, j) (n_min, n_max) in
let k, g = scan_greedyness (j + 1) in
scan_seq_item k (apply_to_head (i, k) (g % f) acc)
| '(' ->
let j, e = scan_group (i + 1) in
if j = l || s.[j] <> ')' then fail (i, i + 1) "Unbalanced '('." else
scan_seq_item (j + 1) (wrap_loc (i, j + 1) e :: acc)
| '^' -> scan_seq_item (i + 1) (re_perl (i, i + 1) :: acc)
| '$' -> scan_seq_item (i + 1) (re_perl (i, i + 1) :: acc)
| '\\' ->
let j, e = scan_escape i in
scan_seq_item j (e :: acc)
| _ -> scan_seq_item (i + 1) (re_perl (i, i + 1) :: acc))
and scan_group i =
(match get i with
| '?' ->
if i + 1 = l then fail (i - 1, i) "Unbalanced '('." else
(match s.[i + 1] with
| '&' ->
let j, idr = scan_ident (i + 2) in
if get j = ':' then
let k, lidr = scan_longident (j + 1) in
(k, Capture_as (idr, wrap_loc (j + 1, k) (Call lidr)))
else
let k, lidr = scan_longident_cont idr.Location.txt j in
(k, Call lidr)
| '<' ->
let j, idr = scan_ident (i + 2) in
if get j <> '>' then fail (i, i + 1) "Unbalanced '<'." else
let k, e = with_loc scan_alt (j + 1) in
(k, Capture_as (idr, e))
| ':' ->
scan_alt (i + 2)
| '#' ->
(try (String.index_from s (i + 2) ')', Seq []) with
| Not_found -> fail (i - 1, i + 1) "Unterminated comment.")
| _ ->
fail (i, i + 2) "Invalid group modifier.")
| '+' -> let j, e = with_loc scan_alt (i + 1) in (j, Capture e)
| '*' | '{' -> fail (i, i + 1) "Invalid group modifier."
| _ -> scan_alt i)
in
let scan_toplevel i =
if get i = '?' && get (i + 1) = '<' then scan_group i else scan_alt i
in
let j, e = with_loc scan_toplevel 0 in
if j <> l then fail (j, j + 1) "Unbalanced ')'." else e