Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source
Source file owl_dataframe.ml
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819# 1 "src/base/misc/owl_dataframe.ml"(*
* OWL - OCaml Scientific and Engineering Computing
* Copyright (c) 2016-2020 Liang Wang <liang.wang@cl.cam.ac.uk>
*)

open Owl_types

(* A single cell value. [Any] is the untyped placeholder. *)
type elt =
  | Bool of bool
  | Int of int
  | Float of float
  | String of string
  | Any

(* A typed column. [Any_Series] is the untyped placeholder and carries no data. *)
type series =
  | Bool_Series of bool array
  | Int_Series of int array
  | Float_Series of float array
  | String_Series of string array
  | Any_Series

type t =
  { (* column-based table, each column is a time series *)
    mutable data : series array
  ; (* head and index of each column, stored in a hash table *)
    mutable head : (string, int) Hashtbl.t
  ; (* size of the used buffer space *)
    mutable used : int
  ; (* size of the allocated buffer *)
    mutable size : int
  }

(* Unpack functions: extract the payload of an [elt] or [series];
   raise [Owl_exception.NOT_SUPPORTED] on any other constructor. *)

let unpack_bool = function
  | Bool x -> x
  | _ -> raise Owl_exception.NOT_SUPPORTED


let unpack_int = function
  | Int x -> x
  | _ -> raise Owl_exception.NOT_SUPPORTED


let unpack_float = function
  | Float x -> x
  | _ -> raise Owl_exception.NOT_SUPPORTED


let unpack_string = function
  | String x -> x
  | _ -> raise Owl_exception.NOT_SUPPORTED


let unpack_bool_series = function
  | Bool_Series x -> x
  | _ -> raise Owl_exception.NOT_SUPPORTED


let unpack_int_series = function
  | Int_Series x -> x
  | _ -> raise Owl_exception.NOT_SUPPORTED


let unpack_float_series = function
  | Float_Series x -> x
  | _ -> raise Owl_exception.NOT_SUPPORTED


let unpack_string_series = function
  | String_Series x -> x
  | _ -> raise Owl_exception.NOT_SUPPORTED


(* Pack functions: wrap a raw value or array in the matching constructor. *)

let pack_bool x = Bool x

let pack_int x = Int x

let pack_float x = Float x

let pack_string x = String x

let pack_bool_series x = Bool_Series x

let pack_int_series x = Int_Series x

let pack_float_series x = Float_Series x

let pack_string_series x = String_Series x

(* Double the length of every column by appending a copy of the column to
   itself; the second half is spare buffer space to be overwritten later. *)
let allocate_space data =
  Array.(
    map
      (function
        | Bool_Series c -> Bool_Series (append c (copy c))
        | Int_Series c -> Int_Series (append c (copy c))
        | Float_Series c -> Float_Series (append c (copy c))
        | String_Series c -> String_Series (append c (copy c))
        | Any_Series -> Any_Series)
      data)


(* Store an element at position [i] of series [x], in place. The series must
   have the matching type, otherwise [Owl_exception.NOT_SUPPORTED] is raised.
   Storing [Any] is a no-op. *)
let set_elt_in_series x i = function
  | Bool a -> (unpack_bool_series x).(i) <- a
  | Int a -> (unpack_int_series x).(i) <- a
  | Float a -> (unpack_float_series x).(i) <- a
  | String a -> (unpack_string_series x).(i) <- a
  | Any -> ()


(* Read position [i] of series [x] as an [elt]; [Any_Series] yields [Any]. *)
let get_elt_in_series x i =
  match x with
  | Bool_Series c -> Bool c.(i)
  | Int_Series c -> Int c.(i)
  | Float_Series c -> Float c.(i)
  | String_Series c -> String c.(i)
  | Any_Series -> Any


(* Create a series of length [n] filled with the given element; the element
   constructor decides the series type. *)
let init_series n = function
  | Bool a -> Bool_Series (Array.make n a)
  | Int a -> Int_Series (Array.make n a)
  | Float a -> Float_Series (Array.make n a)
  | String a -> String_Series (Array.make n a)
  | Any -> Any_Series


(* Resize a series to length [n] via [Owl_utils_array.resize ~head:true],
   passing a per-type default value (true, 0, 0., empty string). *)
let resize_series n = function
  | Bool_Series c -> Bool_Series (Owl_utils_array.resize ~head:true true n c)
  | Int_Series c -> Int_Series (Owl_utils_array.resize ~head:true 0 n c)
  | Float_Series c -> Float_Series (Owl_utils_array.resize ~head:true 0. n c)
  | String_Series c -> String_Series (Owl_utils_array.resize ~head:true "" n c)
  | Any_Series -> Any_Series


(* Concatenate two series of the same type; fails on a type mismatch. *)
let append_series x y =
  match x, y with
  | Bool_Series x, Bool_Series y -> Bool_Series (Array.append x y)
  | Int_Series x, Int_Series y -> Int_Series (Array.append x y)
  | Float_Series x, Float_Series y -> Float_Series (Array.append x y)
  | String_Series x, String_Series y -> String_Series (Array.append x y)
  | Any_Series, Any_Series -> Any_Series
  | _ -> failwith "append_series: unsupported type"


(* Length of the underlying array; [Any_Series] has length 0. *)
let length_series = function
  | Bool_Series c -> Array.length c
  | Int_Series c -> Array.length c
  | Float_Series c -> Array.length c
  | String_Series c -> Array.length c
  | Any_Series -> 0


(* Apply [Owl_utils_array.get_slice slice] to the underlying array. *)
let slice_series slice = function
  | Bool_Series c -> Bool_Series (Owl_utils_array.get_slice slice c)
  | Int_Series c -> Int_Series (Owl_utils_array.get_slice slice c)
  | Float_Series c -> Float_Series (Owl_utils_array.get_slice slice c)
  | String_Series c -> String_Series (Owl_utils_array.get_slice slice c)
  | Any_Series -> Any_Series


(* Result of [Owl_utils_array.argsort] with [Stdlib.compare] on the
   underlying array; an empty array for [Any_Series]. *)
let argsort_series = function
  | Bool_Series c -> Owl_utils_array.argsort ~cmp:Stdlib.compare c
  | Int_Series c -> Owl_utils_array.argsort ~cmp:Stdlib.compare c
  | Float_Series c -> Owl_utils_array.argsort ~cmp:Stdlib.compare c
  | String_Series c -> Owl_utils_array.argsort ~cmp:Stdlib.compare c
  | Any_Series -> [||]


(* Result of [Owl_utils_array.min_i] with [Stdlib.compare]; -1 for
   [Any_Series]. *)
let min_series = function
  | Bool_Series c -> Owl_utils_array.min_i ~cmp:Stdlib.compare c
  | Int_Series c -> Owl_utils_array.min_i ~cmp:Stdlib.compare c
  | Float_Series c -> Owl_utils_array.min_i ~cmp:Stdlib.compare c
  | String_Series c -> Owl_utils_array.min_i ~cmp:Stdlib.compare c
  | Any_Series -> -1


(* Result of [Owl_utils_array.max_i] with [Stdlib.compare]; -1 for
   [Any_Series]. *)
let max_series = function
  | Bool_Series c -> Owl_utils_array.max_i ~cmp:Stdlib.compare c
  | Int_Series c -> Owl_utils_array.max_i ~cmp:Stdlib.compare c
  | Float_Series c -> Owl_utils_array.max_i ~cmp:Stdlib.compare c
  | String_Series c -> Owl_utils_array.max_i ~cmp:Stdlib.compare c
  | Any_Series -> -1


(* New series built with [Owl_utils_array.remove c i]. *)
let remove_ith_elt i = function
  | Bool_Series c -> Bool_Series (Owl_utils_array.remove c i)
  | Int_Series c -> Int_Series (Owl_utils_array.remove c i)
  | Float_Series c -> Float_Series (Owl_utils_array.remove c i)
  | String_Series c -> String_Series (Owl_utils_array.remove c i)
  | Any_Series -> Any_Series


(* New series built with [Owl_utils_array.insert], inserting the payload of
   [e] at [i]. Raises [Owl_exception.NOT_SUPPORTED] when [e] does not match
   the series type. *)
let insert_ith_elt i e = function
  | Bool_Series c -> Bool_Series (Owl_utils_array.insert c [| unpack_bool e |] i)
  | Int_Series c -> Int_Series (Owl_utils_array.insert c [| unpack_int e |] i)
  | Float_Series c -> Float_Series (Owl_utils_array.insert c [| unpack_float e |] i)
  | String_Series c -> String_Series (Owl_utils_array.insert c [| unpack_string e |] i)
  | Any_Series -> Any_Series


(* Same lookup as [get_elt_in_series], with the index as first argument. *)
let get_ith_elt i = function
  | Bool_Series c -> Bool c.(i)
  | Int_Series c -> Int c.(i)
  | Float_Series c -> Float c.(i)
  | String_Series c -> String c.(i)
  | Any_Series -> Any


(* Textual form of an element; [Any] becomes the empty string. *)
let elt_to_str = function
  | Bool a -> string_of_bool a
  | Int a -> string_of_int a
  | Float a -> string_of_float a
  | String a -> a
  | Any -> ""


(* One-letter type tag of a series: "b", "i", "f", "s" or "a". *)
let series_type_to_str = function
  | Bool_Series _c -> "b"
  | Int_Series _c -> "i"
  | Float_Series _c -> "f"
  | String_Series _c -> "s"
  | Any_Series -> "a"


(* Map a type tag to a parser from string to [elt]. For the "f" tag the
   empty string parses to nan. Unknown tags fail. *)
let str_to_elt_fun = function
  | "b" -> fun a -> Bool (bool_of_string a)
  | "i" -> fun a -> Int (int_of_string a)
  | "f" -> fun a -> if a = "" then Float nan else Float (float_of_string a)
  | "s" -> fun a -> String a
  | _ -> failwith "str_to_elt_fun: unsupported type"


(* Convert an array of elements into a series of the same type as [typ];
   only the constructor of [typ] is inspected. *)
let elt_array_to_series typ x =
  match typ with
  | Bool_Series _ -> pack_bool_series (Array.map unpack_bool x)
  | Int_Series _ -> pack_int_series (Array.map unpack_int x)
  | Float_Series _ -> pack_float_series (Array.map unpack_float x)
  | String_Series _ -> pack_string_series (Array.map unpack_string x)
  | Any_Series -> Any_Series


(* Build a frame from column names and optional column data. Without [data]
   every column starts as [Any_Series]. Asserts that names are unique, that
   there is one series per name, and that all series have equal length. *)
let make ?data head_names =
  let col_num = Array.length head_names in
  let head = Hashtbl.create 64 in
  (* check the head names are unique *)
  Array.iteri
    (fun i s ->
      assert (Hashtbl.mem head s = false);
      Hashtbl.add head s i)
    head_names;
  let data =
    match data with
    | Some a -> a
    | None -> Array.make col_num Any_Series
  in
  assert (Array.length data = col_num);
  (* calculate the actual number of rows *)
  let size = if col_num = 0 then 0 else length_series data.(0) in
  let used = size in
  (* check all the series have the same length *)
  Array.iter (fun c -> assert (length_series c = size)) data;
  (* return the generated frame *)
  { data; head; used; size }


let col_num x = Array.length x.data

let row_num x = x.used

let shape x = row_num x, col_num x

let numel x = row_num x * col_num x

let types x = Array.map series_type_to_str x.data

(* Append [row] at the end of the frame, in place. An empty buffer is
   initialised with 16 slots typed after [row]; a full buffer is doubled. *)
let append_row x row =
  assert (col_num x = Array.length row);
  if x.size = 0
  then (
    let n = 16 in
    (* init_series fills every slot with the row value, so row 0 is set *)
    x.data <- Array.map (init_series n) row;
    x.size <- n;
    x.used <- 1)
  else (
    if x.used = x.size
    then (
      x.data <- allocate_space x.data;
      x.size <- length_series x.data.(0));
    Array.iteri (fun i a -> set_elt_in_series x.data.(i) x.used a) row;
    x.used <- x.used + 1)


(* Append column [col] named [head] as the last column, in place. The column
   must have exactly [row_num x] elements; it is resized to the allocated
   buffer length so that all columns keep the same physical length. *)
let append_col x col head =
  let m, n = shape x in
  assert (m = length_series col);
  let col = resize_series x.size col in
  Hashtbl.add x.head head n;
  x.data <- Array.append x.data [| col |]


(* Column names ordered by column index. *)
let get_heads x =
  (* collect the bindings in one pass instead of growing an array with
     Array.append on every step *)
  let kv = Hashtbl.fold (fun k v acc -> (k, v) :: acc) x.head [] |> Array.of_list in
  Array.sort (fun a b -> compare (snd a) (snd b)) kv;
  Array.map fst kv


(* Replace all column names; asserts one unique name per column. *)
let set_heads x head_names =
  assert (Array.length head_names = Array.length x.data);
  let head = Hashtbl.create 64 in
  Array.iteri
    (fun i s ->
      assert (Hashtbl.mem head s = false);
      Hashtbl.add head s i)
    head_names;
  x.head <- head


(* Remove row [i], in place. The index goes through
   [Owl_utils_ndarray.adjust_index] first (presumably to support negative
   indices; confirm against that helper). *)
let remove_row x i =
  let i = Owl_utils_ndarray.adjust_index i (row_num x) in
  let new_data = Array.map (fun s -> remove_ith_elt i s) x.data in
  x.data <- new_data;
  x.used <- x.used - 1;
  x.size <- x.size - 1


(* Remove column [j] and its name, in place. *)
let remove_col x j =
  let j = Owl_utils_ndarray.adjust_index j (col_num x) in
  x.data <- Owl_utils_array.filteri (fun i _ -> i <> j) x.data;
  let new_head = Owl_utils_array.filteri (fun i _ -> i <> j) (get_heads x) in
  set_heads x new_head


(* Insert [row] at row index [i], in place. On an empty buffer this behaves
   like the initialising branch of [append_row]. *)
let insert_row x i row =
  let i = Owl_utils_ndarray.adjust_index i (row_num x) in
  if x.size = 0
  then (
    let n = 16 in
    x.data <- Array.map (init_series n) row;
    x.size <- n;
    x.used <- 1)
  else (
    if x.used = x.size
    then (
      x.data <- allocate_space x.data;
      x.size <- length_series x.data.(0));
    let new_data = Array.mapi (fun j s -> insert_ith_elt i row.(j) s) x.data in
    x.data <- new_data;
    x.used <- x.used + 1;
    (* each series grew by one element *)
    x.size <- x.size + 1)


(* Insert column [col] named [col_head] at column index [j], in place. *)
let insert_col x j col_head col =
  let j = Owl_utils_ndarray.adjust_index j (col_num x) in
  (* Same as append_col: bring the new column to the allocated buffer
     length, otherwise a later append_row would write past its end. *)
  let col = resize_series x.size col in
  let new_data = Owl_utils_array.insert x.data [| col |] j in
  x.data <- new_data;
  let new_head = Owl_utils_array.insert (get_heads x) [| col_head |] j in
  set_heads x new_head


let id_to_head x i = (get_heads x).(i)

(* Raises [Not_found] when [name] is not a column name. *)
let head_to_id x name = Hashtbl.find x.head name

(* Row [i] as an array of elements; [i] is not checked against [x.used]. *)
let get_row x i = Array.map (fun y -> get_elt_in_series y i) x.data

(* Copy of the used part (first [x.used] elements) of column [j]. *)
let get_col x j =
  match x.data.(j) with
  | Bool_Series c -> Bool_Series (Array.sub c 0 x.used)
  | Int_Series c -> Int_Series (Array.sub c 0 x.used)
  | Float_Series c -> Float_Series (Array.sub c 0 x.used)
  | String_Series c -> String_Series (Array.sub c 0 x.used)
  | Any_Series -> Any_Series


let get_rows x idx = Array.map (get_row x) idx

let get_cols x idx = Array.map (get_col x) idx

(* TODO *)
let _get_row_assoc _x _idx =
  raise (Owl_exception.NOT_IMPLEMENTED "owl_dataframe._get_row_assoc")


let get_col_by_name x name =
  let j = Hashtbl.find x.head name in
  get_col x j


let get_cols_by_name x names = Array.map (get_col_by_name x) names

(* Element at row [i], column [j]. *)
let get x i j =
  match x.data.(j) with
  | Bool_Series c -> Bool c.(i)
  | Int_Series c -> Int c.(i)
  | Float_Series c -> Float c.(i)
  | String_Series c -> String c.(i)
  | Any_Series -> Any


(* Overwrite the element at row [i], column [j], in place. Raises
   [Owl_exception.NOT_SUPPORTED] when [a] does not match the column type. *)
let set x i j a =
  match x.data.(j) with
  | Bool_Series c -> c.(i) <- unpack_bool a
  | Int_Series c -> c.(i) <- unpack_int a
  | Float_Series c -> c.(i) <- unpack_float a
  | String_Series c -> c.(i) <- unpack_string a
  | Any_Series -> ()


let get_by_name x i name =
  let j = Hashtbl.find x.head name in
  get x i j


let set_by_name x i name a =
  let j = Hashtbl.find x.head name in
  set x i j a


(* Returns the internal column array itself, spare buffer space included. *)
let to_cols x = x.data

(* All used rows, in order. *)
let to_rows x =
  let stack = Owl_utils.Stack.make () in
  let m = row_num x in
  for i = 0 to m - 1 do
    Owl_utils.Stack.push stack (get_row x i)
  done;
  Owl_utils.Stack.to_array stack


(* Copy of the frame: the head table and every column array are copied. *)
let copy x =
  let head = Hashtbl.copy x.head in
  let used = x.used in
  let size = x.size in
  let data =
    Array.map
      (function
        | Bool_Series c -> Bool_Series (Array.copy c)
        | Int_Series c -> Int_Series (Array.copy c)
        | Float_Series c -> Float_Series (Array.copy c)
        | String_Series c -> String_Series (Array.copy c)
        | Any_Series -> Any_Series)
      x.data
  in
  { data; head; used; size }


(* Empty frame with the same column names and column types as [x]. *)
let copy_struct x =
  let head = Hashtbl.copy x.head in
  let used = 0 in
  let size = 0 in
  let data =
    Array.map
      (function
        | Bool_Series _c -> Bool_Series [||]
        | Int_Series _c -> Int_Series [||]
        | Float_Series _c -> Float_Series [||]
        | String_Series _c -> String_Series [||]
        | Any_Series -> Any_Series)
      x.data
  in
  { data; head; used; size }


(* Drop all rows in place, keeping column names and column types. *)
let reset x =
  x.used <- 0;
  x.size <- 0;
  x.data
    <- Array.map
         (function
           | Bool_Series _c -> Bool_Series [||]
           | Int_Series _c -> Int_Series [||]
           | Float_Series _c -> Float_Series [||]
           | String_Series _c -> String_Series [||]
           | Any_Series -> Any_Series)
         x.data


(* New frame with the columns of [x] followed by the columns of [y]; both
   must have the same number of rows. Column indices of [y] are shifted by
   [col_num x] in the new head table. *)
let concat_horizontal x y =
  assert (row_num x = row_num y);
  let head = Hashtbl.copy x.head in
  let col_num_x = col_num x in
  Hashtbl.iter (fun k v -> Hashtbl.add head k (v + col_num_x)) y.head;
  let col_num_y = col_num y in
  let data = Array.make (col_num_x + col_num_y) Any_Series in
  (* both halves are resized to a common buffer length *)
  let size = max x.size y.size in
  for i = 0 to col_num_x - 1 do
    data.(i) <- resize_series size x.data.(i)
  done;
  for i = 0 to col_num_y - 1 do
    data.(col_num_x + i) <- resize_series size y.data.(i)
  done;
  { data; head; used = x.used; size }


(* New frame with the rows of [x] followed by the rows of [y]. Columns of
   [y] are matched to those of [x] by name, so [y] must contain every column
   name of [x] ([Not_found] otherwise). *)
let concat_vertical x y =
  assert (col_num x = col_num y);
  let head = Hashtbl.copy x.head in
  let used = x.used + y.used in
  let data = Array.make (col_num x) Any_Series in
  for i = 0 to col_num x - 1 do
    let sx = get_col x i in
    let j = id_to_head x i |> head_to_id y in
    let sy = get_col y j in
    data.(i) <- append_series sx sy
  done;
  { data; head; used; size = used }


(* Call [f i row] for every used row, in order. *)
let iteri_row f x =
  let m = row_num x in
  for i = 0 to m - 1 do
    f i (get_row x i)
  done


let iter_row f x = iteri_row (fun _ row -> f row) x

(* New frame whose rows are [f i row]; column names are copied from [x] and
   column types are taken from the first row that [f] returns. *)
let mapi_row f x =
  let head = Hashtbl.copy x.head in
  let used = 0 in
  let size = 0 in
  let data = Array.map (fun _ -> Any_Series) x.data in
  let y = { data; head; used; size } in
  iteri_row (fun i row -> append_row y (f i row)) x;
  y


let map_row f x = mapi_row (fun _ row -> f row) x

(* New frame holding the rows for which [f i row] is true. *)
let filteri_row f x =
  let head = Hashtbl.copy x.head in
  let used = 0 in
  let size = 0 in
  let data = Array.map (fun _ -> Any_Series) x.data in
  let y = { data; head; used; size } in
  iteri_row (fun i row -> if f i row = true then append_row y row) x;
  y


let filter_row f x = filteri_row (fun _ row -> f row) x

(* New frame holding [r] for every row where [f i row] returns [Some r]. *)
let filter_mapi_row f x =
  let head = Hashtbl.copy x.head in
  let used = 0 in
  let size = 0 in
  let data = Array.map (fun _ -> Any_Series) x.data in
  let y = { data; head; used; size } in
  iteri_row
    (fun i row ->
      match f i row with
      | Some r -> append_row y r
      | None -> ())
    x;
  y


let filter_map_row f x = filter_mapi_row (fun _ row -> f row) x

(* Slice the frame with a two-element slice definition (rows, then columns),
   each given as an int list; returns a new frame. The definition is
   validated by [Owl_base_slicing.check_slice_definition]. *)
let get_slice slice x =
  let slice = Array.(map of_list (of_list slice)) |> Array.map (fun s -> R_ s) in
  let shp_x = [| row_num x; col_num x |] in
  let _tmp0 = Owl_base_slicing.check_slice_definition slice shp_x in
  let slice =
    Array.map
      (function
        | R_ s -> s
        | _ -> failwith "get_slice: unsupported")
      _tmp0
  in
  (* slice.(1) selects the column names, slice.(0) the rows of each column *)
  let name = Owl_utils_array.get_slice slice.(1) (get_heads x) in
  let data =
    Array.map (head_to_id x) name |> get_cols x |> Array.map (slice_series slice.(0))
  in
  let used = length_series data.(0) in
  let head = Hashtbl.create (Array.length name) in
  Array.iteri (fun i s -> Hashtbl.add head s i) name;
  { data; head; used; size = used }


(* TODO 
*)

let _set_slice _x = raise (Owl_exception.NOT_IMPLEMENTED "owl_dataframe._set_slice")

(* Slice the frame by row range and column names. [slice] is a pair: an int
   list defining the row slice and a list of column names. An empty name
   list selects all columns. Returns a new frame. *)
let get_slice_by_name slice x =
  let row_slice = Array.of_list (fst slice) in
  let col_slice = Array.of_list (snd slice) in
  let shp_x = [| row_num x; col_num x |] in
  let refmt = Owl_base_slicing.check_slice_definition [| R_ row_slice |] shp_x in
  let row_slice =
    (function
      | R_ s -> s
      (* message now names this function rather than get_slice *)
      | _ -> failwith "get_slice_by_name: unsupported")
      refmt.(0)
  in
  let col_slice = if Array.length col_slice = 0 then get_heads x else col_slice in
  let data = Array.map (slice_series row_slice) (get_cols_by_name x col_slice) in
  let used = length_series data.(0) in
  let head = Hashtbl.create (Array.length col_slice) in
  Array.iteri (fun i s -> Hashtbl.add head s i) col_slice;
  { data; head; used; size = used }


(* TODO *)
let _set_slice_by_name _x =
  raise (Owl_exception.NOT_IMPLEMENTED "owl_dataframe._set_slice_by_name")


(* First [n] rows as a new frame; asserts 0 < n <= row_num x. *)
let head n x =
  let m = row_num x in
  assert (n > 0 && n <= m);
  get_slice [ [ 0; n - 1 ]; [] ] x


(* Last [n] rows as a new frame; asserts 0 < n <= row_num x. *)
let tail n x =
  let m = row_num x in
  assert (n > 0 && n <= m);
  get_slice [ [ -n; -1 ]; [] ] x


(* Result of [min_series] on the column named [head]. *)
let min_i x head =
  let series = get_col_by_name x head in
  min_series series


(* Result of [max_series] on the column named [head]. *)
let max_i x head =
  let series = get_col_by_name x head in
  max_series series


(* New frame with the rows of [x] reordered by the argsort of column [head].
   With [~inc:false] the index order is reversed. [x] is not modified. *)
let sort ?(inc = true) x head =
  let series = get_col_by_name x head in
  let indices = argsort_series series in
  if inc = false then Owl_utils_array.reverse indices;
  let y = copy_struct x in
  Array.iter (fun i -> get_row x i |> append_row y) indices;
  y


(* Distinct values of column [head], in order of first occurrence, returned
   as a series of the same type as the column. *)
let unique x head =
  let series = get_col_by_name x head in
  let s_size = length_series series in
  let h_size = max 64 (s_size / 2) in
  (* the table is used as a set of values already seen *)
  let htbl = Hashtbl.create h_size in
  let stack = Owl_utils_stack.make () in
  for i = 0 to s_size - 1 do
    let k = get_ith_elt i series in
    if Hashtbl.mem htbl k = false
    then (
      Hashtbl.add htbl k None;
      Owl_utils_stack.push stack k)
  done;
  let elt_array = Owl_utils_stack.to_array stack in
  elt_array_to_series series elt_array


(* FIXME: not finished ... 
*)

(* Unfinished: only resolves the optional arguments. [on] defaults to column
   index 0 and [how] to `Inner; the pair is returned. *)
let _join ?on ?how x =
  let on =
    match on with
    | Some a -> head_to_id x a
    | None -> 0
  in
  let how =
    match how with
    | Some a -> a
    | None -> `Inner
  in
  on, how


(* I/O functions *)

(* Guess the field separator from sample [lines]. Candidates are ranked by
   how many fields they produce on the first line; the first candidate that
   yields the same field count on every line wins. Falls back to a comma
   when [lines] is empty or no candidate is consistent. *)
let guess_separator lines =
  if Array.length lines = 0
  then ','
  else (
    let sep = [| ','; ' '; '\t'; ';'; ':'; '|' |] in
    (* rank by dividing as many parts as possible *)
    let tmp =
      Array.map
        (fun c ->
          let l = String.split_on_char c lines.(0) in
          c, List.length l)
        sep
    in
    (* sort by decreasing order *)
    Array.sort (fun a b -> snd b - snd a) tmp;
    let sep = Array.map fst tmp in
    let not_sep = ref true in
    let sep_idx = ref 0 in
    (* The bound on sep_idx is required: without it an exhausted candidate
       list indexes past the end of sep instead of reaching the fallback. *)
    while !not_sep && !sep_idx < Array.length sep do
      let c = sep.(!sep_idx) in
      let n = String.split_on_char c lines.(0) |> List.length in
      (try
         Array.iter
           (fun line ->
             let m = String.split_on_char c line |> List.length in
             if m <> n then raise Owl_exception.FOUND)
           lines;
         not_sep := false
       with
      | Owl_exception.FOUND -> ());
      if !not_sep then sep_idx := !sep_idx + 1
    done;
    (* if cannot detect, return comma as default sep *)
    if !not_sep = false then sep.(!sep_idx) else ',')


(* Guess a type tag ("b", "i", "f", or the default "s") for every column of
   the sample [lines] split on [sep]. The first line is treated as a header
   and skipped; the column count is taken from it. *)
let guess_types sep lines =
  (* Note: no need to add "s" since it is default type *)
  let typ = [| "b"; "i"; "f" |] in
  let num_lines = Array.length lines in
  (* at least two lines because the first one will be dropped *)
  assert (num_lines > 1);
  let num_cols = lines.(0) |> String.trim |> String.split_on_char sep |> List.length in
  (* split into separate columns *)
  let stacks = Array.init num_cols (fun _ -> Owl_utils_stack.make ()) in
  Array.iteri
    (fun i line ->
      if i > 0
      then
        String.trim line
        |> String.split_on_char sep
        |> List.iteri (fun j c ->
               (* fields beyond the header width are ignored here rather
                  than indexing past the end of stacks *)
               if j < num_cols then Owl_utils_stack.push stacks.(j) c))
    lines;
  let cols = Array.map Owl_utils_stack.to_array stacks in
  (* guess the types of columns *)
  Array.mapi
    (fun _i col ->
      let guess_typ = ref "s" in
      (try
         Array.iter
           (fun col_typ ->
             let typ_fun = str_to_elt_fun col_typ in
             let wrong_guess = ref false in
             (* any parse failure on any value rules this type out *)
             (try
                Array.iter
                  (fun x ->
                    let y = String.trim x in
                    typ_fun y |> ignore)
                  col
              with
             | _exn -> wrong_guess := true);
             if !wrong_guess = false
             then (
               guess_typ := col_typ;
               (* FOUND is used to leave the iteration at the first match *)
               raise Owl_exception.FOUND))
           typ
       with
      | _exn -> ());
      !guess_typ)
    cols


(* Load a CSV file into a frame. [sep], [head] (column names) and [types]
   (type tags) are guessed from the first 100 lines when omitted. Line 0 of
   the file is always skipped as the header line. Lines that fail to parse
   are dropped with a warning. *)
let of_csv ?sep ?head ?types fname =
  let lines = Owl_io.head 100 fname in
  let sep =
    match sep with
    | Some a -> a
    | None -> guess_separator lines
  in
  let head_i = 0 in
  let head_names =
    match head with
    | Some a -> a
    | None -> Owl_io.csv_head ~sep head_i fname
  in
  let types =
    match types with
    | Some a -> a
    | None -> guess_types sep lines
  in
  assert (Array.length head_names = Array.length types);
  let convert_f = Array.map str_to_elt_fun types in
  let dataframe = make head_names in
  let dropped_line = ref 0 in
  Owl_io.read_csv_proc
    ~sep
    (fun i line ->
      try
        if i <> head_i
        then (
          let row = Array.map2 (fun f a -> f a) convert_f line in
          append_row dataframe row)
      with
      | _exn ->
        dropped_line := !dropped_line + 1;
        Owl_log.warn "of_csv: fail to parse line#%i @ %s" i fname)
    fname;
  if !dropped_line > 0 then Owl_log.warn "%i lines have been dropped." !dropped_line;
  dataframe


(* Write the frame to [fname] as CSV, column names first. *)
let to_csv ?sep x fname =
  let m, n = shape x in
  (* include heads as the first line *)
  let csv = Array.make_matrix (m + 1) n "" in
  csv.(0) <- get_heads x;
  (* dump the data into the table *)
  for i = 0 to m - 1 do
    for j = 0 to n - 1 do
      csv.(i + 1).(j) <- elt_to_str (get x i j)
    done
  done;
  Owl_io.write_csv ?sep csv fname


(* let print x = Owl_pretty.pp_dataframe Format.std_formatter x *)

(* Indexing operators. [idx] is a (row index, column name) pair. *)

let ( .%() ) x idx = get_by_name x (fst idx) (snd idx)

let ( .%()<- ) x idx a = set_by_name x (fst idx) (snd idx) a

(* Rows satisfying predicate [f], as a new frame. *)
let ( .?() ) x f = filter_row f x

(* New frame holding [g r] for every row [r] satisfying [f]. *)
let ( .?()<- ) x f g =
  filter_map_row (fun r -> if f r = true then Some (g r) else None) x


let ( .$() ) x slice = get_slice_by_name slice x

(* TODO *)
let[@warning "-32"] ( .$()<- ) _x _idx _a =
  raise (Owl_exception.NOT_IMPLEMENTED "owl_dataframe.( .$( )<- )")


(* ends here *)