package owl

  1. Overview
  2. Docs
Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source

Source file owl_nlp_utils.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# 1 "src/owl/nlp/owl_nlp_utils.ml"
(*
 * OWL - OCaml Scientific and Engineering Computing
 * Copyright (c) 2016-2017
 *   Ben Catterall <bpwc2@cam.ac.uk>
 *   Liang Wang <liang.wang@cl.cam.ac.uk>
 *)

(* some useful regular expressions *)

(* Token delimiter: matches one or more consecutive characters taken from a
   set of blanks (space, tab) and punctuation marks, both ASCII and
   typographic. It is not referenced by the other definitions visible in this
   file.
   NOTE(review): the typographic quotes and the em dash inside the character
   class are non-ASCII; Str presumably treats them byte by byte rather than as
   whole characters - confirm before relying on this for general UTF-8 text. *)
let regexp_split = Str.regexp "[ \t;,.'!?()’“”\\/&—\\-]+"

(* Double the capacity of [x]: the result holds the elements of [x] followed
   by as many empty arrays as [x] has elements. A log message is emitted on
   every call. *)
let _allocate_space x =
  Owl_log.info "allocate more space";
  let padding = Array.make (Array.length x) [||] in
  Array.append x padding


(* [load_from_file ?stopwords f] reads the text file [f] as a corpus, one
   document per line.

   Each line is split on single space characters and every token that is a
   key of [stopwords] is dropped; when [stopwords] is omitted nothing is
   filtered. The result is an array with one entry per line read, each entry
   being the array of the remaining tokens of that line.

   The number of documents and the number of kept tokens are logged once the
   whole file has been read. The input channel is closed on every exit path,
   including when an exception escapes. [open_in] raises [Sys_error] if the
   file cannot be opened. *)
let load_from_file ?stopwords f =
  Owl_log.info "load text corpus";
  let t =
    match stopwords with
    | Some t -> t
    | None   -> Hashtbl.create 2
  in
  (* compile the delimiter once instead of once per input line *)
  let sep = Str.regexp " " in
  (* growable buffer of documents; [c] counts documents, [w] counts tokens *)
  let x = ref (Array.make (64 * 1024) [||]) in
  let c = ref 0 in
  let w = ref 0 in
  let h = open_in f in
  Fun.protect
    (fun () ->
      (try
         while true do
           (* grow the buffer just before its last free slot would be used *)
           if !c = Array.length !x - 1 then x := _allocate_space !x;
           let s =
             Str.split sep (input_line h)
             |> List.filter (fun tok -> not (Hashtbl.mem t tok))
             |> Array.of_list
           in
           !x.(!c) <- s;
           c := !c + 1;
           w := !w + Array.length s
         done
       with
      (* end of input is the normal way out of the reading loop *)
      | End_of_file -> ());
      Owl_log.info "load %i docs, %i words" !c !w;
      (* trim the unused tail of the buffer *)
      Array.sub !x 0 !c)
    ~finally:(fun () -> close_in h)


(* [load_from_string ?stopwords s] splits [s] on single space characters and
   returns the resulting pieces as an array, leaving out every piece that is a
   key of [stopwords]. When [stopwords] is omitted nothing is left out. *)
let load_from_string ?stopwords s =
  let keep =
    match stopwords with
    | None -> fun _ -> true
    | Some tbl -> fun word -> not (Hashtbl.mem tbl word)
  in
  let pieces = Str.split (Str.regexp " ") s in
  Array.of_list (List.filter keep pieces)


(* [load_stopwords f] reads the file [f] line by line and returns a hash table
   whose keys are the distinct lines of the file, each bound once to [0].
   The input channel is closed on every exit path. *)
let load_stopwords f =
  Owl_log.info "load stopwords";
  let table = Hashtbl.create (64 * 1024) in
  let ic = open_in f in
  (* consume the channel until end of file, registering each line once *)
  let rec read_all () =
    match input_line ic with
    | word ->
      Hashtbl.replace table word 0;
      read_all ()
    | exception End_of_file -> ()
  in
  Fun.protect
    ~finally:(fun () -> close_in ic)
    (fun () ->
      read_all ();
      table)


(* [build_vocabulary x] returns the pair [(w2i, i2w)] built from the corpus
   [x], an array of documents that are themselves arrays of words.
   [w2i] maps each distinct word to an index and [i2w] is the inverse mapping.
   Indices are the positions of the words once sorted with [String.compare]. *)
let build_vocabulary x =
  Owl_log.info "build up vocabulary";
  let w2i = Hashtbl.create (64 * 1024) in
  (* first pass: the table is used as a set to deduplicate the words *)
  Array.iter (Array.iter (fun word -> Hashtbl.replace w2i word 0)) x;
  let words = Array.of_seq (Hashtbl.to_seq_keys w2i) in
  Array.sort String.compare words;
  let i2w = Hashtbl.create (Array.length words) in
  (* second pass: drop the placeholder bindings and store the real indices *)
  Hashtbl.reset w2i;
  Array.iteri
    (fun idx word ->
      Hashtbl.add w2i word idx;
      Hashtbl.add i2w idx word)
    words;
  w2i, i2w


(* [tokenise dict data] replaces every element of [data] by the value it is
   bound to in [dict]. [Hashtbl.find] raises [Not_found] for an element that
   has no binding. *)
let tokenise dict data =
  data |> Array.map (fun word -> Hashtbl.find dict word)

(* [tokenise_all dict data] applies the lookup in [dict] to every element of
   every inner array of [data], keeping the nesting unchanged.
   [Hashtbl.find] raises [Not_found] for an element that has no binding. *)
let tokenise_all dict data =
  let lookup word = Hashtbl.find dict word in
  Array.map (fun doc -> Array.map lookup doc) data

(* [save_vocabulary x f] hands the value [x] and the file name [f] to
   [Owl_io.marshal_to_file]. *)
let save_vocabulary vocab path =
  Owl_io.marshal_to_file vocab path

(* [load_vocabulary f] returns whatever [Owl_io.marshal_from_file] yields for
   the file name [f]. *)
let load_vocabulary path =
  Owl_io.marshal_from_file path

(* [save_lda_model m f] logs a message and passes [m] to
   [Owl_io.marshal_to_file], using [f] with the suffix [.model] appended as
   the file name. *)
let save_lda_model m f =
  Owl_log.info "save LDA model";
  let path = f ^ ".model" in
  Owl_io.marshal_to_file m path


(* [load_lda_model f] logs a message and returns the result of
   [Owl_io.marshal_from_file] on [f] with the suffix [.model] appended. *)
let load_lda_model f =
  Owl_log.info "load LDA model";
  let path = f ^ ".model" in
  Owl_io.marshal_from_file path


(* TODO: perform simple processing of the passed in string.
   Until that is implemented this is the identity: the argument is returned
   unchanged. *)
let simple_process text = text

(* ends here *)
OCaml

Innovation. Community. Security.