Source file owl_nlp_tfidf.ml
# 1 "src/owl/nlp/owl_nlp_tfidf.ml"
(*
* OWL - an OCaml numerical library for scientific computing
* Copyright (c) 2016-2018 Liang Wang <liang.wang@cl.cam.ac.uk>
 *)

(** NLP: TFIDF module *)

type tf_typ =
  | Binary
  | Count
  | Frequency
  | Log_norm

type df_typ =
  | Unary
  | Idf
  | Idf_Smooth

type t = {
  mutable uri      : string;             (* file path of the model *)
  mutable tf_typ   : tf_typ;             (* function to calculate term freq *)
  mutable df_typ   : df_typ;             (* function to calculate doc freq *)
  mutable offset   : int array;          (* record the offset of each document *)
  mutable doc_freq : float array;        (* document frequency *)
  mutable corpus   : Owl_nlp_corpus.t;   (* corpus type *)
  mutable handle   : in_channel option;  (* file descriptor of the tfidf *)
}

(* various types of TF and IDF functions *)

let term_freq = function
  | Binary    -> fun tc tn -> 1.
  | Count     -> fun tc tn -> tc
  | Frequency -> fun tc tn -> tc /. tn
  | Log_norm  -> fun tc tn -> 1. +. log tc

let doc_freq = function
  | Unary      -> fun dc nd -> 1.
  | Idf        -> fun dc nd -> log (nd /. dc)
  | Idf_Smooth -> fun dc nd -> log (nd /. (1. +. dc))

let tf_typ_string = function
  | Binary    -> "binary"
  | Count     -> "raw count"
  | Frequency -> "frequency"
  | Log_norm  -> "log normalised count"

let df_typ_string = function
  | Unary      -> "unary"
  | Idf        -> "inverse frequency"
  | Idf_Smooth -> "inverse frequency smooth"

let create tf_typ df_typ corpus =
  let base_uri = Owl_nlp_corpus.get_uri corpus in
  {
    uri      = base_uri ^ ".tfidf";
    tf_typ;
    df_typ;
    offset   = [||];
    doc_freq = [||];
    corpus;
    handle   = None;
  }

let get_uri m = m.uri

let get_corpus m = m.corpus

let length m = Array.length m.offset - 1

let vocab_len m = m.corpus |> Owl_nlp_corpus.get_vocab |> Owl_nlp_vocabulary.length

let get_handle m =
  match m.handle with
  | Some x -> x
  | None   ->
    let h = m |> get_uri |> open_in in
    m.handle <- Some h;
    h

(* calculate document frequency for a given word *)
let doc_count_of m w =
  let v = Owl_nlp_corpus.get_vocab m.corpus in
  let i = Owl_nlp_vocabulary.word2index v w in
  m.doc_freq.(i)

(* count occurrences in all documents, for all words *)
let doc_count vocab fname =
  let n_w = Owl_nlp_vocabulary.length vocab in
  let d_f = Array.make n_w 0. in
  let _h = Hashtbl.create 1024 in
  let n_d = ref 0 in
  Owl_io.iteri_lines_of_marshal (fun i doc ->
    Hashtbl.clear _h;
    Array.iter (fun w ->
      match Hashtbl.mem _h w with
      | true  -> ()
      | false -> Hashtbl.add _h w 0
    ) doc;
    Hashtbl.iter (fun w _ -> d_f.(w) <- d_f.(w) +. 1.) _h;
    n_d := i;
  ) fname;
  d_f, !n_d

(* count the term occurrences in a document *)
let term_count _h doc =
  Array.iter (fun w ->
    match Hashtbl.mem _h w with
    | true  -> (
        let a = Hashtbl.find _h w in
        Hashtbl.replace _h w (a +. 1.)
      )
    | false -> Hashtbl.add _h w 1.
  ) doc

(* make [x] a unit vector by dividing by its l2 norm *)
let normalise x =
  let c = Array.fold_left (fun a (w, b) -> a +. b *. b) 0. x |> sqrt in
  Array.map (fun (w, b) -> (w, b /. c)) x

(* build TF-IDF model from an empty model, m: empty tf-idf model *)
let _build_with norm sort tf_fun df_fun m =
  let vocab = Owl_nlp_corpus.get_vocab m.corpus in
  let tfile = Owl_nlp_corpus.get_tok_uri m.corpus in
  let fname = m.uri in

  Owl_log.info "calculate document frequency ...";
  let d_f, n_d = doc_count vocab tfile in
  let n_d = Owl_nlp_corpus.length m.corpus |> float_of_int in
  m.doc_freq <- d_f;

  Owl_log.info "calculate tf-idf ...";
  let fo = open_out fname in
  (* buffer for calculating term frequency *)
  let _h = Hashtbl.create 1024 in
  (* variable for tracking the offset in the output model *)
  let offset = Owl_utils.Stack.make () in
  Owl_utils.Stack.push offset 0;

  Owl_io.iteri_lines_of_marshal (fun i doc ->
    (* first count terms in one doc *)
    term_count _h doc;

    (* prepare temporary variables *)
    let tfs = Array.make (Hashtbl.length _h) (0, 0.) in
    let tn = Array.length doc |> float_of_int in
    let j = ref 0 in

    (* calculate tf-idf values *)
    Hashtbl.iter (fun w tc ->
      let tf_df = (tf_fun tc tn) *. (df_fun d_f.(w) n_d) in
      tfs.(!j) <- w, tf_df;
      j := !j + 1;
    ) _h;

    (* check if we need to normalise *)
    let tfs = match norm with
      | true  -> normalise tfs
      | false -> tfs
    in

    (* check if we need to sort term ids in increasing order *)
    let _ = match sort with
      | true  -> Array.sort (fun a b -> Pervasives.compare (fst a) (fst b)) tfs
      | false -> ()
    in

    (* save to file and update offset *)
    Marshal.to_channel fo tfs [];
    Owl_utils.Stack.push offset (LargeFile.pos_out fo |> Int64.to_int);

    (* remember to clear the buffer *)
    Hashtbl.clear _h;
  ) tfile;

  (* finished, clean up *)
  m.offset <- offset |> Owl_utils.Stack.to_array;
  close_out fo

let build ?(norm=false) ?(sort=false) ?(tf=Count) ?(df=Idf) corpus =
  let m = create tf df corpus in
  let tf_fun = term_freq tf in
  let df_fun = doc_freq df in
  _build_with norm sort tf_fun df_fun m;
  m

(* random access and iteration functions *)

let next m : (int * float) array = m |> get_handle |> Marshal.from_channel

let next_batch ?(size=100) m =
  let batch = Owl_utils.Stack.make () in
  (
    try for i = 0 to size - 1 do
      m |> next |> Owl_utils.Stack.push batch
    done
    with exn -> ()
  );
  Owl_utils.Stack.to_array batch

let iteri f m = Owl_io.iteri_lines_of_marshal f m.uri

let mapi f m = Owl_io.mapi_lines_of_marshal f m.uri

let get m i : (int * float) array =
  let fh = open_in m.uri in
  seek_in fh m.offset.(i);
  let doc = Marshal.from_channel fh in
  close_in fh;
  doc

let reset_iterators m =
  let _reset_offset = function
    | Some h -> seek_in h 0
    | None   -> ()
  in
  _reset_offset m.handle

(* convert a single document according to a given model *)
let apply m doc =
  (* FIXME *)
  let f t_f d_f n_d = t_f *. log (n_d /. (1. +. d_f)) in
  let n_d = Owl_nlp_corpus.length m.corpus |> float_of_int in
  let d_f = m.doc_freq in
  let doc = Owl_nlp_corpus.tokenise m.corpus doc in
  let _h = Hashtbl.create 1024 in
  term_count _h doc;
  let tfs = Array.make (Hashtbl.length _h) (0, 0.) in
  let i = ref 0 in
  Hashtbl.iter (fun w t_f ->
    tfs.(!i) <- w, f t_f d_f.(w) n_d;
    i := !i + 1;
  ) _h;
  tfs

(* I/O functions *)

let save m f =
  m.corpus <- Owl_nlp_corpus.reduce_model m.corpus;
  m.handle <- None;
  Owl_io.marshal_to_file m f

let load f : t = Owl_io.marshal_from_file f

let to_string m =
  Printf.sprintf "TfIdf model\n" ^
  Printf.sprintf "  uri        : %s\n" m.uri ^
  Printf.sprintf "  tf_type    : %s\n" (m.tf_typ |> tf_typ_string) ^
  Printf.sprintf "  df_type    : %s\n" (m.df_typ |> df_typ_string) ^
  Printf.sprintf "  # of docs  : %i\n" (length m) ^
  Printf.sprintf "  # of vocab : %i" (vocab_len m) ^
  ""

let print m = m |> to_string |> print_endline

(* experimental functions *)

(* percentage of non-zero elements in doc-term matrix *)
let density m =
  let n_d = length m |> float_of_int in
  let n_t = vocab_len m |> float_of_int in
  let nnz = ref 0 in
  iteri (fun _ _ -> nnz := !nnz + 1) m;
  (float_of_int !nnz) /. (n_d *. n_t)

let doc_to_vec k m x =
  let v = Owl_dense.Ndarray.Generic.zeros k [|vocab_len m|] in
  Array.iter (fun (i, a) -> Owl_dense.Ndarray.Generic.set v [|i|] a) x;
  v

(* calculate pairwise distance for the whole model, format (id, dist) *)
let all_pairwise_distance typ m x =
  let dist_fun = Owl_nlp_similarity.distance typ in
  let l = mapi (fun i y -> i, dist_fun x y) m in
  Array.sort (fun a b -> Pervasives.compare (snd a) (snd b)) l;
  l

(* k-nearest neighbour, very slow due to linear search *)
let nearest ?(typ=Owl_nlp_similarity.Cosine) m x k =
  let l = all_pairwise_distance typ m x in
  Array.sub l 0 k

(* ends here *)
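
(* The sketch below is not part of the original file; it is a minimal usage
 * example of the module above. It assumes a plain-text corpus file
 * "news.txt" (one document per line) exists and that Owl_nlp_corpus.build
 * accepts such a file path; both the file name and that call are
 * illustrative assumptions, not guarantees about the corpus API. *)
let _example_usage () =
  (* build a corpus, then a normalised, sorted TF-IDF model over it *)
  let corpus = Owl_nlp_corpus.build "news.txt" in
  let model = build ~norm:true ~sort:true ~tf:Count ~df:Idf corpus in
  print model;
  (* vectorise an unseen document with the trained model *)
  let v = apply model "owl is a numerical library" in
  Array.iter (fun (w, x) -> Printf.printf "%i:%g " w x) v;
  (* random access to the sparse tf-idf vector of the third document *)
  let d = get model 2 in
  Printf.printf "\n#terms in doc 2: %i\n" (Array.length d)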