package sexplib0

  1. Overview
  2. Docs

Source file raw_grammar.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
(** Representation of S-expression grammars *)

(** This module defines the representation of S-expression grammars produced by
    [@@deriving sexp_grammar].  It introduces an AST to represent these grammars and a
    notion of "group" to represent the grammars of a mutually recursive set of OCaml
    type declaration.

    The grammar for a given type expression can be constructed via: {[

      [%sexp_grammar: <type>]

    ]}

    {3 Goals and non-goals}

    Functionality goals: With post-processing, sexp grammars can be pretty-printed in a
    human-readable format and provides enough information to implement completion and
    validation tools.

    Performance goals: [@@deriving sexp_grammar] adds minimal overhead and introduces no
    toplevel side effect. The compiler can lift the vast majority of ASTs generated by
    [@@deriving sexp_grammar] as global constants. Common sub-grammars are usually shared,
    particularly when they derive from multiple applications of the same functor.

    Non-goals: Stability, although we will make changes backwards-compatible or at least
    provide a reasonable upgrade path.

    In what follows, we describe how this is achieved.

    {3 Encoding of generated grammars to maximize sharing}

    A [group] contains the grammars for all types of a mutually recursive group of OCaml
    type declarations.

    To ensure maximum sharing, a group is split into two parts:

    - The [generic_group] depends only on the textual type declarations. Where the type
      declaration refers to an existing concrete type, the generic group takes a variable
      to represent the grammar of that type. This means that the compiler can lift each
      type declaration in the source code to a shared global constant.

    - The [group] binds the type variables of the [generic_group], either to concrete
      grammars where the type declaration refers to a concrete type, or to another
      variable where the type declaration itself was polymorphic.

    To understand this point better, imagine the following type declaration {[

      type t = X of u

    ]} were explicitly split into its [generic_group] and [group] parts: {[

      type 'u t_generic = X of 'u
      type t = u t_generic

    ]}

    If [u] came from a functor argument, it's easy to see that [t_generic] would be
    exactly the same in all applications of the functor and only [t] would vary. The
    grammar of [t_generic], which is the biggest part, would be shared between all
    applications of the functor.

    {3 Processing of grammars}

    The [Raw_grammar.t] type optimizes for performance over ease of use. To help users
    process the raw grammars into a more usable form, we keep two identifiers in the
    generated grammars:

    - The [generic_group_id] uniquely identifies a [generic_group]. It is a hash of the
      generic group itself. (It is okay that this scheme would conflate identical type
      declarations, because the resulting generic groups would be identical as well.)

    - The [group_id] uniquely identifies a [group]. It is a unique integer, generated
      lazily so that we don't create a side effect at module creation time.

    The exact processing would depend on the final application. We expect that a typical
    consumer of sexp grammars would define less-indirected equivalents of the [t] and
    [group] types, possibly re-using the [_ type_] and [Atom.t] types.
*)

(** The label of a field, constructor, or constant. *)
type label = string

type generic_group_id = string
type group_id = Lazy_group_id.t

(** Variable names. These are used to improve readability of the printed grammars.
    Internally, we use numerical indices to represent variables; see [Implicit_var]
    below. *)
type var_name = string

type type_name = string

(** A grammatical type which classifies atoms. *)
module Atom = struct
  type t =
    | String  (** Any atom. *)
    | Bool  (** One of [true], [false], [True], or [False]. *)
    | Char  (** A single-character atom. *)
    | Float  (** An atom which parses as a {!float}. *)
    | Int  (** An atom which parses as an integer, such as {!int} or {!int64}. *)
    | This of { ignore_capitalization : bool; string : string }
    (** Exactly that string, possibly modulo case in the first character. *)
end

(** A grammatical type which classifies sexps. Corresponds to a non-terminal in a
    context-free grammar. *)
type 't type_ =
  | Any  (** Any list or atom. *)
  | Apply of 't type_ * 't type_ list  (** Assign types to (explicit) type variables. *)
  | Atom of Atom.t  (** An atom, in particular one of the given {!Atom.t}. *)
  | Explicit_bind of var_name list * 't type_
  (** In [Bind ([ "a"; "b" ], Explicit_var 0)], [Explicit_var 0] is ["a"]. One must bind
      all available type variables: free variables are not permitted. *)
  | Explicit_var of int
  (** Indices for type variables, e.g. ['a], introduced by polymorphic definitions.

      Unlike de Bruijn indices, these are always bound by the nearest ancestral
      [Explicit_bind]. *)
  | Grammar of 't  (** Embeds other types in a grammar. *)
  | Implicit_var of int
  (** Indices for type constructors, e.g. [int], in scope. Unlike de Bruijn indices, these
      are always bound by the [implicit_vars] of the nearest enclosing [generic_groups].
  *)
  | List of 't sequence_type
  (** A list of a certain form. Depending on the {!sequence_type}, this might
      correspond to an OCaml tuple, list, or embedded record. *)
  | Option of 't type_
  (** An optional value. Either syntax recognized by [option_of_sexp] is supported:
      [(Some 42)] or [(42)] for a value and [None] or [()] for no value. *)
  | Record of 't record_type
  (** A list of lists, representing a record of the given {!record_type}. For
      validation, [Record recty] is equivalent to [List [Fields recty]]. *)
  | Recursive of type_name
  (** A type in the same mutually recursive group, possibly the current one. *)
  | Union of 't type_ list
  (** Any sexp matching any of the given types. {!Variant} should be preferred when
      possible, especially for complex types, since validation and other algorithms may
      behave exponentially.

      One useful special case is [Union []], the empty type. This is occasionally
      generated for things such as abstract types. *)
  | Variant of 't variant_type  (** A sexp which matches the given {!variant_type}. *)

(** A grammatical type which classifies sequences of sexps. Here, a "sequence" may mean
    either a list on its own or, say, the sexps following a constructor in a list
    matching a {!variant_type}.

    Certain operations may greatly favor simple sequence types. For example, matching
    [List [ Many type_ ]] is easy for any type [type_] (assuming [type_] itself is
    easy), but [List [ Many type1; Many type2 ]] may require backtracking. Grammars
    derived from OCaml types will only have "nice" sequence types. *)
and 't sequence_type = 't component list

(** Part of a sequence of sexps. *)
and 't component =
  | One of 't type_  (** Exactly one sexp of the given type. *)
  | Optional of 't type_  (** One sexp of the given type, or nothing at all. *)
  | Many of 't type_  (** Any number of sexps, each of the given type. *)
  | Fields of 't record_type
  (** A succession of lists, collectively defining a record of the given {!record_type}.
      The fields may appear in any order. The number of lists is not necessarily fixed,
      as some fields may be optional. In particular, if all fields are optional, there
      may be zero lists. *)

(** A tagged union of grammatical types. Grammars derived from OCaml variants will have
    variant types. *)
and 't variant_type =
  { ignore_capitalization : bool
  (** If true, the grammar is insensitive to the case of the first letter of the label.
      This matches the behavior of derived [sexp_of_t] functions. *)
  ; alts : (label * 't sequence_type) list
  (** An association list of labels (constructors) to sequence types. A matching sexp is
      a list whose head is the label as an atom and whose tail matches the given
      sequence type. As a special case, an alternative whose sequence is empty matches
      an atom rather than a list (i.e., [label] rather than [(label)]). This is in
      keeping with generated [t_of_sexp] functions.

      As a workaround, to match [(label)] one could use
      [("label", [ Optional (Union []) ])]. *)
  }

(** A collection of field definitions specifying a record type. Consists only of an
    association list from labels to fields. *)
and 't record_type =
  { allow_extra_fields: bool
  ; fields: (label * 't field) list
  }

(** A field in a record. *)
and 't field =
  { optional : bool  (** If true, the field is optional. *)
  ; args : 't sequence_type
  (** A sequence type which the arguments to the field must match. An empty sequence is
      permissible but would not be generated for any OCaml type. *)
  }

type t =
  | Ref of type_name * group
  | Inline of t type_

and group =
  { gid : group_id
  ; generic_group : generic_group
  ; origin : string
  (** [origin] provides a human-readable hint as to where the type was defined.

      For a globally unique identifier, use [gid] instead.

      See [ppx/ppx_sexp_conv/test/expect/test_origin.ml] for examples. *)
  ; apply_implicit : t list
  }

and generic_group =
  { implicit_vars : var_name list
  ; ggid : generic_group_id
  ; types : (type_name * t type_) list
  }
OCaml

Innovation. Community. Security.