Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source
Source file owl_stats.ml
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785# 1 "src/owl/stats/owl_stats.ml"(*
* OWL - OCaml Scientific and Engineering Computing
* Copyright (c) 2016-2019 Liang Wang <liang.wang@cl.cam.ac.uk>
*)[@@@warning"-32"](** Random numbers and distributions *)includeOwl_stats_dist(* Randomisation function *)letshufflex=lety=Array.copyxinOwl_stats_extend.shuffley;yletchoosexk=assert(Array.lengthx>=k);lety=Array.makekx.(0)inOwl_stats_extend.choose~src:x~dst:y;yletsamplexk=lety=Array.makekx.(0)inOwl_stats_extend.sample~src:x~dst:y;y(* Basic statistical functions *)letsumx=Owl_stats_extend.sumxletmeanx=Owl_stats_extend.meanxlet_get_meanmx=matchmwith|Somea->a|None->meanxletvar?meanx=Owl_stats_extend.varx(_get_meanmeanx)letstd?meanx=Owl_stats_extend.stdx(_get_meanmeanx)letsem?meanx=lets=std?meanxinletn=float_of_int(Array.lengthx)ins/.(sqrtn)letabsdev?meanx=Owl_stats_extend.absdevx(_get_meanmeanx)letskew?mean?sdx=letm=_get_meanmeanxinlets=matchsdwith|Somea->a|None->std~mean:mxinOwl_stats_extend.skewxmsletkurtosis?mean?sdx=letm=_get_meanmeanxinlets=matchsdwith|Somea->a|None->std~mean:mxinOwl_stats_extend.kurtosisxms(* TODO: move to C code *)letcentral_moment=Owl_base_stats.central_momentletcorrcoefx0x1=assertArray.(lengthx0=lengthx1);Owl_stats_extend.corrcoefx0x1(* TODO: optimise *)letsort=Owl_base_stats.sortletargsort=Owl_base_stats.argsortlet_resolve_tiesnextd=function|`Average->float_of_intnext-.float_of_intd/.2.|`Min->float_of_int(next-d)|`Max->float_of_intnextletrank?(ties_strategy=`Average)vs=letn=Array.lengthvsinletorder=argsortvsinletranks=Array.maken0.inletd=ref0inbeginfori=0ton-1doifi==n-1||comparevs.(order.(i))vs.(order.(i+1))<>0thenlettie_rank=_resolve_ties(i+1)!dties_strategyinforj=i-!dtoidoranks.(order.(j))<-tie_rankdone;d:=0elseincrd(* Found a duplicate! *)done;end;ranksletautocorrelation?(lag=1)x=letn=Array.lengthxinlety=meanxinleta=ref0.infori=0to(n-lag-1)doa:=!a+.((x.(i)-.y)*.(x.(i+lag)-.y))done;letb=ref0.infori=0to(n-1)dob:=!b+.(x.(i)-.y)**2.done;(!a/.!b)letcov?m0?m1x0x1=assertArray.(lengthx0=lengthx1);letm0=_get_meanm0x0inletm1=_get_meanm1x1inOwl_stats_extend.covx0x1m0m1letconcordant=Owl_base_stats.concordantletdiscordant=Owl_base_stats.discordantletkendall_tau=Owl_base_stats.kendall_tauletspearman_rhox0x1=letr0=rankx0inletr1=rankx1inleta=covr0r1inletb=(stdr0)*.(stdr1)ina/.bletminmax_i=Owl_base_stats.minmax_iletmin_i=Owl_base_stats.min_iletmax_i=Owl_base_stats.max_iletmin=Owl_base_stats.minletmax=Owl_base_stats.maxletminmax=Owl_base_stats.minmaxtypehistogram=Owl_base_stats.histogramlethistogram=Owl_base_stats.histogramlethistogram_sorted=Owl_base_stats.histogram_sortedletnormalise=Owl_base_stats.normaliseletnormalise_density=Owl_base_stats.normalise_densityletpp_hist=Owl_base_stats.pp_histletecdfx=letx=sort~inc:truexinletn=Array.lengthxinletm=float_of_intninlety=ref[||]inletf=ref[||]inleti=ref0inletc=ref0.inwhile!i<ndoletj=ref!iinwhile(!j<n)&&(x.(!i)=x.(!j))doc:=!c+.1.;j:=!j+1done;y:=Array.append!y[|x.(!i)|];f:=Array.append!f[|!c/.m|];i:=!jdone;!y,!fletquantilex=letx=sort~inc:truexinfunp->ifp<0.||p>1.thenraise(Invalid_argument"Owl_stats.quantile: expected float between 0 and 1")elseOwl_stats_extend.quantilexpletpercentilexp=quantilex(p/.100.)letmedianx=percentilex50.letfirst_quartilex=percentilex25.letthird_quartilex=percentilex75.letinterquartile=Owl_base_stats.interquartileletz_score~mu~sigmax=Array.map(funy->(y-.mu)/.sigma)xlett_scorex=letmu=meanxinletsigma=stdxinz_score~mu~sigmaxletnormlise_pdfx=letc=Owl_stats_extend.sumxinArray.map(funx->x/.c)x(* TODO *)letcenterise_x=Noneletstandarderise_x=Noneletksdensity_x=None(* Hypothesis tests *)typetail=BothSide|RightSide|LeftSidetypehypothesis={reject:bool;p_value:float;score:float;}letmake_hypothesisrejectp_valuescore={reject;p_value;score}letz_test~mu~sigma?(alpha=0.05)?(side=BothSide)x=letn=float_of_int(Array.lengthx)inletz=(meanx-.mu)*.(sqrtn)/.sigmainletpl=gaussian_cdf~mu:0.~sigma:1.zinletpr=gaussian_sf~mu:0.~sigma:1.zinletp=matchsidewith|LeftSide->pl|RightSide->pr|BothSide->min[|pl;pr|]*.2.inleth=alpha>pinmake_hypothesishpzlett_test~mu?(alpha=0.05)?(side=BothSide)x=letn=float_of_int(Array.lengthx)inletm=meanxinlets=std~mean:mxinlett=(m-.mu)*.(sqrtn)/.sinletpl=t_cdf~df:(n-.1.)~loc:0.~scale:1.tinletpr=t_sf~df:(n-.1.)~loc:0.~scale:1.tinletp=matchsidewith|LeftSide->pl|RightSide->pr|BothSide->min[|pl;pr|]*.2.inleth=alpha>pinmake_hypothesishptlett_test_paired?(alpha=0.05)?(side=BothSide)xy=letnx=float_of_int(Array.lengthx)inletny=float_of_int(Array.lengthy)inlet_=ifnx<>nythenfailwith"the sizes of two samples does not equal."inletd=Owl_utils.Array.map2i(fun_ab->a-.b)xyinletm=Owl_stats_extend.sumd/.nxinlett=m/.(sem~mean:md)inletpl=t_cdf~df:(nx-.1.)~loc:0.~scale:1.tinletpr=t_sf~df:(nx-.1.)~loc:0.~scale:1.tinletp=matchsidewith|LeftSide->pl|RightSide->pr|BothSide->min[|pl;pr|]*.2.inleth=alpha>pinmake_hypothesishptlet_t_test2_equal_var~alpha~sidexy=letnx=float_of_int(Array.lengthx)inletny=float_of_int(Array.lengthy)inletxm=meanxinletym=meanyinletxs=stdxinletys=stdyinletv=nx+.ny-.2.inlett=(xm-.ym)/.(sqrt(((xs**2.)/.nx)+.((ys**2.)/.ny)))inletpl=t_cdf~df:v~loc:0.~scale:1.tinletpr=t_sf~df:v~loc:0.~scale:1.tinletp=matchsidewith|LeftSide->pl|RightSide->pr|BothSide->min[|pl;pr|]*.2.inleth=alpha>pinmake_hypothesishptlet_t_test2_welche~alpha~sidexy=letnx=float_of_int(Array.lengthx)inletny=float_of_int(Array.lengthy)inletxm=meanxinletym=meanyinletxs=stdxinletys=stdyinletvx=nx-.1.inletvy=ny-.1.inletv=((((xs**2.)/.nx)+.((ys**2.)/.ny))**2.)/.((xs**4.)/.((vx*.(nx**2.)))+.((ys**4.)/.(vy*.(ny**2.))))inlett=(xm-.ym)/.(sqrt(((xs**2.)/.nx)+.((ys**2.)/.ny)))inletpl=t_cdf~df:v~loc:0.~scale:1.tinletpr=t_sf~df:v~loc:0.~scale:1.tinletp=matchsidewith|LeftSide->pl|RightSide->pr|BothSide->min[|pl;pr|]*.2.inleth=alpha>pinmake_hypothesishptlett_test_unpaired?(alpha=0.05)?(side=BothSide)?(equal_var=true)xy=matchequal_varwith|true->_t_test2_equal_var~alpha~sidexy|false->_t_test2_welche~alpha~sidexyletsmirnovne=letnn=int_of_float(floor((float_of_intn)*.(1.-.e)))inletrechelpersumvc=letevn=e+.(float_of_intv)/.(float_of_intn)inletsum'=sum+.c*.(evn**(float_of_int(v-1)))*.((1.-.evn)**(float_of_int(n-v)))inletc'=c*.(float_of_int(n-v))/.(float_of_int(v+1))inifv<=nnthenhelpersum'(v+1)c'elsesuminlethelper2()=letmaxlog=logmax_floatinletlngamma=Owl_maths_special.loggammainletlgamnp1=lngamma(1.+.float_of_intn)inletrechelper3sumv=letevn=e+.(float_of_intv)/.(float_of_intn)inletomevn=1.-.evninlett=lgamnp1-.lngamma(1.+.float_of_intv)-.lngamma(1.+.float_of_int(n+v))inletsum'=sum+.exptinifv<=nnthenifabs_floatomevn>0.&&t>~-.maxlogthenhelper3sum'(v+1)elsehelper3sum(v+1)elsesuminhelper30.0inifnot(n>0&&e>=0.&&e<=1.)thennanelseife=0.0then1.0elseifn<1013thene*.(helper0.01.)elsee*.(helper2())letkolmogorovy=letx=(-2.)*.y*.yinletrechelpersignsumr=lett=exp(x*.r*.r)inletsum'=sum+.sign*.tinletr'=r+.1.inletsign'=~-.signinift=0.0||(t/.sum'<=1.1e-16)thensum'elsehelpersign'sum'r'inify<1.1e-16then1.0else2.*.helper1.0.1.letks_test?(alpha=0.05)xf=letx'=sortxinletmaxpq=ifp>qthenpelseqinletn=Array.lengthx'inletnn=float_of_intninletfvals=Array.mapfx'inletg1iv=v-.(float_of_inti)/.nninletg2iv=(float_of_int(i+1))/.nn-.vinletd1=Array.fold_leftmax0.(Array.mapig1fvals)inletd2=Array.fold_leftmax0.(Array.mapig2fvals)inletd=maxd1d2inletpval=2.*.(smirnovnd)inletpval2=kolmogorov(d*.sqrtnn)inifn=0thenraiseOwl_exception.EMPTY_ARRAYelseifn>2666||pval2>0.8-.nn*.0.003thenmake_hypothesis(pval2<alpha)pval2delsemake_hypothesis(pval<alpha)pvaldletrecuniquesl=matchlwith|[]->[]|x::[]->x::[]|x1::x2::xs->ifx1=x2thenuniques(x2::xs)elsex1::(uniques(x2::xs))(* Compute the empirical CDF of a list of samples from the input
domain (sorted list of floats). The output is a list of length
equal to domain. Both inputs are assumed to be sorted. *)letempCdfdomainsamples=letreccountxsamples=matchsampleswith|[]->(0,samples)|y::ys->ifx=ythenlet(n,rest)=countxysin(n+1,rest)else(0,samples)inletrecaggregateaccumdomainsamples=matchdomainwith|[]->[]|x::xs->let(p,rest)=countxsamplesinletaccum'=accum+pinaccum'::(aggregateaccum'xsrest)inletn=float_of_int(List.lengthsamples)inleta=aggregate0domainsamplesinList.map(funx->(float_of_intx)/.n)aletks2_test?(alpha=0.05)xy=letn1=Array.lengthxinletn2=Array.lengthyinifn1=0||n2=0thenraiseOwl_exception.EMPTY_ARRAYelseletnn1=float_of_intn1inletnn2=float_of_intn2inletx'=Array.to_list(sortx)inlety'=Array.to_list(sorty)inletdomain=uniques(Array.to_list(sort(Array.concat[x;y])))inletxCdf=empCdfdomainx'inletyCdf=empCdfdomainy'inletdiffs=List.map2(funpq->abs_float(p-.q))xCdfyCdfinletmaxpq=ifp>qthenpelseqinletd=List.fold_leftmax0.diffsinleten=sqrt(nn1*.nn2/.(nn1+.nn2))inletpval=kolmogorov((en+.0.12+.0.11/.en)*.d)inmake_hypothesis(pval<alpha)pvaldletad_test_x=None(* Anderson-Darling test *)letdw_test_x=None(* Durbin-Watson test *)letjb_test?(alpha=0.05)x=(* Jarque-Bera test *)letn=float_of_int(Array.lengthx)inlets=skewxinletk=kurtosisxinletj=(n/.6.)*.((s**2.)+.(((k-.3.)**2.)/.4.))inletp=chi2_sf~df:2.jinleth=alpha>pinmake_hypothesishpjletvar_test?(alpha=0.05)?(side=BothSide)~variancex=letn=float_of_int(Array.lengthx)inletv=n-.1.inletk=v*.(varx)/.varianceinletpl=chi2_cdf~df:vkinletpr=chi2_sf~df:vkinletp=matchsidewith|LeftSide->pl|RightSide->pr|BothSide->min[|pl;pr|]*.2.inleth=alpha>pinmake_hypothesishpkletfisher_test?(alpha=0.05)?(side=BothSide)abcd=letcdf?(max_prob=1.)kn1n2t=letleft=Stdlib.max0(t-n2)inletright=matchmax_probwith|1.->k|_->Stdlib.minn1tinleteps=0.000000001inOwl_utils.range_foldleftright~f:(funaccx->letp=hypergeometric_pdf~good:n1~bad:n2~sample:txinif(p<max_prob)||(abs_float(p-.max_prob))<epsthenacc+.pelseacc)~init:0.0in(* let n = a + b + c + d in *)letprob=hypergeometric_pdf~good:(a+b)~bad:(c+d)~sample:(a+c)ainletoddsratio=((float_of_inta)*.(float_of_intd))/.((float_of_intb)*.(float_of_intc))inletp=matchsidewith|BothSide->cdfa(a+b)(c+d)(a+c)~max_prob:prob|RightSide->cdfb(b+a)(c+d)(b+d)|LeftSide->cdfa(a+b)(c+d)(a+c)inleth=alpha>pinmake_hypothesishpoddsratioletlillie_test_x=None(* Lilliefors test *)lettiecorrectrankvals=letranks_sort=sortrankvalsinletcounts=Owl_utils.count_dup(Array.to_listranks_sort)inletsize=(float_of_int(Array.lengthrankvals))inletnumerator=Array.fold_left(+)0(Array.of_list(List.map(fun(_x,y)->y*y*y-y)counts))inmatchsizewith|0.0->1.0|1.0->1.0|_->1.0-.(float_of_intnumerator)/.(size*.size*.size-.size)(* Mann–Whitney U test *)letmannwhitneyu?(alpha=0.05)?(side=BothSide)xy=letrecexact_aunm=ifu<0.then0.elseifu>=m*.(n-.m)thenfloat_of_int(Owl_maths.combination(int_of_floatn)(int_of_floatm))elseif(m=1.||(n-.m)=1.)thenu+.1.else((exact_au(n-.1.)m)+.(exact_a(u-.(n-.m))(n-.1.)(m-.1.)))inletn1=float_of_int(Array.lengthx)inletn2=float_of_int(Array.lengthy)inletranked=rank(Array.appendxy)inletrankx=Array.fold_left(+.)0.0(Array.subranked0(int_of_floatn1))inletu1=n1*.n2+.(n1*.(n1+.1.0))/.2.0-.rankxinletu2=n1*.n2-.u1inletasymptotic_v=lett=tiecorrectrankedinletsd=sqrt(t*.n1*.n2*.(n1+.n2+.1.0)/.12.0)inletmean=n1*.n2/.2.0inletbigu=matchsidewith|BothSide->Stdlib.maxu1u2|RightSide->u2|LeftSide->u1inletz=(bigu-.mean)/.sdinletp=matchsidewith|BothSide->2.0*.gaussian_sf~mu:0.~sigma:1.(abs_floatz)|_->gaussian_sf~mu:0.~sigma:1.zinleth=alpha>pinmake_hypothesishpu2inletexact_v=letbigu=matchsidewith|BothSide->Stdlib.minu1u2|RightSide->u1|LeftSide->u2inletp=letn=n1+.n2inletk=ifn1<n2thenn2elsen1inletz=(exact_abigu(n1+.n2)k)/.float_of_int(Owl_maths.combination(int_of_floatn)(int_of_floatk))inmatchsidewith|BothSide->2.*.z|_->zinleth=alpha>pinmake_hypothesishpu2inif(maxranked)=(n1+.n2)&&(max[|n1;n2|])<10.thenexact1elseasymptotic1(* wilcoxon paired *)letwilcoxon?(alpha=0.05)?(side=BothSide)xy=letd=Array.map2(funab->a-.b)xyinletd=Owl_utils.Array.filter(funa->a<>0.)dinletn=float_of_int(Array.lengthd)inletrankval=rank(Array.mapabs_floatd)inletrp=Array.map2(funab->(ifa>0.0then1.else0.)*.b)drankvalinletrm=Array.map2(funab->(ifa<0.0then1.else0.)*.b)drankvalinletrp=Array.fold_left(+.)0.rpinletrm=Array.fold_left(+.)0.rminlett=Stdlib.minrprminletasymptotic_v=letmn=n*.(n+.1.)*.0.25inletse=n*.(n+.1.)*.(2.*.n+.1.)inlett_correctionrankvals=letranks_sort=sortrankvalsinletcounts=Owl_utils.count_dup(Array.to_listranks_sort)in(* let size = (float_of_int (Array.length rankvals)) in *)Array.fold_left(+)0(Array.of_list(List.map(fun(_x,y)->y*y*y-y)counts))inletcorr=float_of_int(t_correctionrankval)inletse=sqrt((se-.0.5*.corr)/.24.)inletz=(t-.mn)/.seinletp=2.0*.gaussian_sf~mu:0.~sigma:1.(abs_floatz)inmatchsidewith|BothSide->p|RightSide->(1.-.p/.2.)|LeftSide->p/.2.inletexactv=letrecfwn=if(w=n*.(n+.1.)/.2.)||(w=0.&&n>=0.)then1.elseif(w<0.&&n>0.)||(w>0.&&n=0.)||(n<0.)then0.elsefw(n-.1.)+.f(w-.n)(n-.1.)inletn1=float_of_int(Array.lengthx)inletv=matchsidewith|RightSide->v-.1.|_->vinletp=ifv<0.then0.elseArray.fold_left(+.)0.(Owl_utils.Array.map(funi->f(float_of_inti)n1)(Owl_utils.Array.range0(int_of_floatv)))inmatchsidewith|BothSide->2.*.p/.(2.**n1)|RightSide->1.-.(p/.(2.**n1))|LeftSide->p/.(2.**n1)inletp=if(Array.lengthd)=(Array.lengthx)&&n<10.thenexacttelseasymptotic1inleth=alpha>pinmake_hypothesishptletruns_test?(alpha=0.05)?(side=BothSide)?vx=(* Run test for randomness *)letv=matchvwith|Somev->v|None->medianxinletn1,n2=ref0.,ref0.inletz=ref[||]inlet_=Array.iter(funy->ify>vthen(n1:=!n1+.1.;z:=Array.append!z[|1|])elseify<vthen(n2:=!n2+.1.;z:=Array.append!z[|-1|]))xinletr0=ref1.inlet_=fori=0toArray.length!z-2domatch(!z.(i)*!z.(i+1))<0with|true->r0:=!r0+.1.|false->()doneinletaa=2.*.!n1*.!n2inletbb=!n1+.!n2inletr1=aa/.bb+.1.inletsr=aa*.(aa-.bb)/.(bb*.bb*.(bb-.1.))inletz=(!r0-.r1)/.(sqrtsr)inletpl=gaussian_cdf~mu:0.~sigma:1.zinletpr=gaussian_sf~mu:0.~sigma:1.zinletp=matchsidewith|LeftSide->pl|RightSide->pr|BothSide->min[|pl;pr|]*.2.inleth=alpha>pinmake_hypothesishpzletcrosstab_x=None(* Cross-tabulation *)(* MCMC: Metropolis and Gibbs sampling *)letmetropolis_hastingsfpn=letstepsize=0.1in(* be careful about step size, try 0.01 *)leta,b=1000,10inlets=Array.makenpinfori=0toa+b*n-1doletp'=Array.map(funx->gaussian_rvs~mu:0.~sigma:stepsize+.x)pinlety,y'=fp,fp'inletp'=(ify'>=ythenp'elseifstd_uniform_rvs()<(y'/.y)thenp'elseArray.copyp)inArray.iteri(funix->p.(i)<-x)p';if(i>=a)&&(imodb=0)thens.((i-a)/b)<-(Array.copyp)done;sletgibbs_samplingfpn=leta,b=1000,10inletm=a+b*ninlets=Array.makenpinletc=Array.lengthpinfori=1tom-1doforj=0toc-1dop.(j)<-fpjdone;if(i>=a)&&(imodb=0)thens.((i-a)/b)<-(Array.copyp)done;slettukey_fences?(k=1.5)arr=letfirst_quartile=first_quartilearrinletthird_quartile=third_quartilearrinletoffset=k*.(third_quartile-.first_quartile)infirst_quartile-.offset,third_quartile+.offsetletgaussian_kde=Owl_base_stats.gaussian_kdelet_=(* init the internal state of PRNG *)Owl_stats_prng.self_init()(* ends here *)