package metadata

  1. Overview
  2. Docs

Source file metadataCharEncoding.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
type recode =
  ?source:[ `ISO_8859_1 | `UTF_8 | `UTF_16 | `UTF_16LE | `UTF_16BE ] ->
  ?target:[ `UTF_8 | `UTF_16 | `UTF_16LE | `UTF_16BE ] ->
  string ->
  string

module type T = sig
  val convert :
    ?source:[ `ISO_8859_1 | `UTF_8 | `UTF_16 | `UTF_16LE | `UTF_16BE ] ->
    ?target:[ `UTF_8 | `UTF_16 | `UTF_16LE | `UTF_16BE ] ->
    string ->
    string
end

module Naive : T = struct
  let convert ?(source = `UTF_8) ?(target = `UTF_8) s =
    match (source, target) with
      | `UTF_8, `UTF_8
      | `UTF_16, `UTF_16
      | `UTF_16LE, `UTF_16LE
      | `UTF_16BE, `UTF_16BE
      | _ ->
          let buf = Buffer.create 10 in
          let len = String.length s in
          let add_unicode_char =
            match target with
              | `UTF_8 -> Buffer.add_utf_8_uchar
              | `UTF_16BE -> Buffer.add_utf_16be_uchar
              | _ -> Buffer.add_utf_16le_uchar
          in
          let unicode_byte_length =
            match source with
              | `ISO_8859_1 -> fun _ -> 1
              | `UTF_8 -> Uchar.utf_8_byte_length
              | _ -> Uchar.utf_16_byte_length
          in
          let s, get_unicode_char =
            match source with
              | `ISO_8859_1 -> (s, fun s pos -> Uchar.of_char s.[pos])
              | `UTF_8 ->
                  ( s,
                    fun s pos ->
                      Uchar.utf_decode_uchar (String.get_utf_8_uchar s pos) )
              | `UTF_16BE ->
                  ( s,
                    fun s pos ->
                      Uchar.utf_decode_uchar (String.get_utf_16be_uchar s pos)
                  )
              | `UTF_16LE ->
                  ( s,
                    fun s pos ->
                      Uchar.utf_decode_uchar (String.get_utf_16le_uchar s pos)
                  )
              | `UTF_16 ->
                  let default =
                    ( "",
                      fun s pos ->
                        Uchar.utf_decode_uchar (String.get_utf_16be_uchar s pos)
                    )
                  in
                  if len < 2 then default
                  else (
                    let rem = String.sub s 2 (len - 2) in
                    match (s.[0], s.[1]) with
                      | '\xfe', '\xff' ->
                          ( rem,
                            fun s pos ->
                              Uchar.utf_decode_uchar
                                (String.get_utf_16be_uchar s pos) )
                      | '\xff', '\xfe' ->
                          ( rem,
                            fun s pos ->
                              Uchar.utf_decode_uchar
                                (String.get_utf_16le_uchar s pos) )
                      | _ -> default)
          in
          if target = `UTF_16 then add_unicode_char buf Uchar.bom;
          let len = String.length s in
          let rec f pos =
            if pos = len then Buffer.contents buf
            else (
              let c = get_unicode_char s pos in
              add_unicode_char buf c;
              f (pos + unicode_byte_length c))
          in
          f 0
end
OCaml

Innovation. Community. Security.