/*-----------------------------------------------------------*/
/*  Text Mining Primitives (vector-space model, clustering)  */
/*-----------------------------------------------------------*/
/*  (C) 2004 Zdravko Markov                                  */
/*-----------------------------------------------------------*/

/*-----------------------------------------------------------*/
/*  Vector Space Model                                       */
/*-----------------------------------------------------------*/
/*-----------------------------------------------------------*/
/*  vectors(+Files,+IDF,-Vectors)                            */
/*   Generates a TFIDF vector for each document in Files     */
/*-----------------------------------------------------------*/
/*  binvectors(+Files,+IDF,-Vectors)                         */
/*   Generates a binary vector for each document in Files    */
/*-----------------------------------------------------------*/

% vectors(+Files,+IDF,-Vectors)
%   Walks Files and pairs every entry with the vector computed by
%   vector/3.  A labeled entry File-Class becomes File-Class-Vector,
%   an unlabeled File becomes File-Vector.
vectors([], _, []) :- !.
vectors([File-Class|Rest], IDF, [File-Class-Vec|Vecs]) :-
    !,
    vector(File, IDF, Vec),
    !,
    vectors(Rest, IDF, Vecs).
vectors([File|Rest], IDF, [File-Vec|Vecs]) :-
    vector(File, IDF, Vec),
    !,
    vectors(Rest, IDF, Vecs).

% vector(+File,+IDF,-NVector)
%   Tokenizes File, weights every IDF term with termvalues/3 and scales
%   the result with normalize/3 using the value returned by norm/2.
%   When the norm is zero the unscaled vector is returned.  The zero
%   test is arithmetic (=\=) rather than structural (\==): the norm of
%   an all-zero vector can be the float 0.0, which `\== 0` let through
%   to normalize/3 and ended in a division by zero.
vector(File,IDF,NVector) :-
    see(File),
    tokenize(WL),
    seen,
    termvalues(IDF,WL,Vector),
    norm(Vector,Norm), !,
    (   Norm =\= 0,
        normalize(Vector,Norm,NVector)
    ;   NVector = Vector          % zero norm: nothing to scale
    ), !.

% binvectors(+Files,+IDF,-Vectors)
%   Same traversal as vectors/3, but every document is described by the
%   0/1 vector produced by binvector/3.
binvectors([], _, []) :- !.
binvectors([File-Class|Rest], IDF, [File-Class-Bits|Vecs]) :-
    !,
    binvector(File, IDF, Bits),
    !,
    binvectors(Rest, IDF, Vecs).
binvectors([File|Rest], IDF, [File-Bits|Vecs]) :-
    binvector(File, IDF, Bits),
    !,
    binvectors(Rest, IDF, Vecs).

% binvector(+File,+IDF,-Vector)
%   Tokenizes File and lets binvalues/3 build the vector for the IDF
%   term list from the resulting word list.
binvector(File, IDF, Bits) :-
    see(File),
    tokenize(Words),
    seen,
    binvalues(IDF, Words, Bits).

/*-----------------------------------------------------------*/
/*  idf(+Files,+Terms,+N,-IDFN)
     IDFN holds at most N IDF-Term pairs.  Terms are ordered by
     ascending document count (keysort/2), i.e. highest IDF
     first.  Terms that occur in none of the Files are left as
     bare atoms by update_idf/3; they are dropped here because
     keysort/2 raises a type error on non-pair elements and no
     IDF value can be computed for them.                       */
/*-----------------------------------------------------------*/

idf(Files,Terms,N,IDFN) :-
    idfcount(Files,Terms,Counted), !,
    idf_seen_terms(Counted,Pairs),
    keysort(Pairs,Sorted),
    length(Files,D),
    calc_idf(Sorted,D,IDF),
    firstn(N,IDF,IDFN).

% idf_seen_terms(+Counted,-Pairs)
%   Keeps only the Count-Term pairs of Counted.
idf_seen_terms([],[]).
idf_seen_terms([N-W|T],[N-W|V]) :- !,
    idf_seen_terms(T,V).
idf_seen_terms([_|T],V) :-
    idf_seen_terms(T,V).

% idfcount(+Files,+Counts0,-Counts)
%   Threads the term list through update_idf/3 once per file, using the
%   word list that tokenize/1 reads from that file.
idfcount([], Counts, Counts) :- !.
idfcount([File|Files], Counts0, Counts) :-
    see(File),
    tokenize(Words),
    seen,
    update_idf(Counts0, Words, Counts1),
    idfcount(Files, Counts1, Counts).

/*-----------------------------------------------------------*/
/*  tf(+Files,+N,-Terms)
     Terms are the first N words of the frequency-ordered word
     list that wfreq/2 builds from the text of all Files.      */
/*-----------------------------------------------------------*/

tf(Files, N, Terms) :-
    corpus(Files, Words),
    wfreq(Words, ByFrequency),
    firstn1(N, ByFrequency, Terms).
    
/*-----------------------------------------------------------*/
/*  class(+Files,+Classes,-LabeledFiles)
     Classes is a list of Label-Members pairs.  Every file is
     paired with the first label whose member list contains it;
     the call fails if some file belongs to no class.          */
/*-----------------------------------------------------------*/

class([], _, []) :- !.
class([File|Files], Classes, [File-Label|Labeled]) :-
    member(Label-Members, Classes),
    memberchk(File, Members),
    !,
    class(Files, Classes, Labeled).

/*-----------------------------------------------------------*/
/*  arff(+Terms,+Vectors,+File)
     Writes Vectors to File in WEKA ARFF format: a @relation
     line, an id attribute, one numeric attribute per term, an
     optional class attribute and the @data rows.
     The output redirection is wrapped in setup_call_cleanup/3
     so that `told` runs even when a print step fails or
     throws; otherwise all later output would keep going to
     File.                                                     */
/*-----------------------------------------------------------*/

arff(Terms,Vectors,File) :-
    setup_call_cleanup(
        tell(File),
        once(( write('@relation '),writeln(File),nl,
               writeln('@attribute id string'),
               printattr(Terms),
               printclassattr(Vectors),nl,
               writeln('@data'),
               printdata(Vectors) )),
        told).

% printattr(+Terms)
%   Writes one "@attribute <term> numeric" line per Weight-Term pair.
printattr([]) :- !.
printattr([_-Term|Terms]) :-
    write('@attribute '),
    write(Term),
    tab(1),
    writeln('numeric'),
    printattr(Terms).

% printclassattr(+Vectors)
%   For labeled vectors (Id-Class-Vector) writes the nominal class
%   attribute listing every distinct class; prints nothing otherwise.
printclassattr(Vectors) :-
    Vectors = [_-_-_|_],
    !,
    setof(Class, Id^Vec^member(Id-Class-Vec, Vectors), Classes),
    write('@attribute class '),
    write('{'),
    printrow(Classes),
    writeln('}').
printclassattr(_).

% printdata(+Vectors)
%   Writes one comma-separated row per vector: the id, the values and,
%   for labeled vectors, the class as last field.
printdata([]) :- !.
printdata([Id-Class-Values|Rest]) :-
    !,
    printrow([Id|Values]),
    write(','),
    writeln(Class),
    printdata(Rest).
printdata([Id-Values|Rest]) :-
    !,
    printrow([Id|Values]),
    nl,
    printdata(Rest).

% printrow(+Items)
%   Writes the items of a non-empty list separated by commas, without a
%   trailing newline.  Fails on the empty list.
printrow([Last]) :-
    write(Last),
    !.
printrow([Item|Items]) :-
    write(Item),
    write(','),
    printrow(Items).

/*-----------------------------------------------------------*/
/*  Agglomerative Hierarchical Clustering                    */
/*-----------------------------------------------------------*/
/*-----------------------------------------------------------*/
/*  cluster(+E, -Quality)                                    */
/*   E = [x1-[v11,v12,...],x2-[v21,v22,...],...]             */
/*   Quality is the average similarity between all clusters  */
/*-----------------------------------------------------------*/

% cluster(+E,-Quality)
%   Clusters E with cluster/3 and prints the resulting tree with show/1.
cluster(Examples, Quality) :-
    cluster(Examples, Tree, Quality),
    show(Tree).

% cluster(+E,-Clustering,-Quality)
%   Runs cluster/4 and reports Quality as the summed similarity of all
%   merges divided by the number of merges.  A single-item input is
%   returned by cluster/4 with zero merges; Quality is then 0 instead of
%   evaluating 0/0, which raised a zero-divisor error.
cluster(E,Clustering,Quality) :-
    cluster(E,Clustering,D,N),
    (   N =:= 0
    ->  Quality = 0
    ;   Quality is D/N
    ).

% cluster(+E,-Clustering,-Q,-N)
%   E is a list of Cluster-Vector items.  Each recursion level merges
%   the most similar pair into one item.  Q is the sum of the
%   similarities of all merges performed and N their number.
cluster(E,Clustering,Q,N) :- 
    % score every pair of current items with similarity/3
    findall([A,B]/D,(pair(E,A,B),similarity(A,B,D)),All),
    % with fewer than two items All is [] and maximum/2 fails, which
    % hands control to the base clause below
    maximum(All,[A-X,B-Y]/D), 
    del(A-X,E,E1),
    del(B-Y,E1,E2),
    % the merged item is the nested list [A,B] with the centroid of
    % the two vectors as its vector
    centroid([X,Y],C),
    cluster([[A,B]-C|E2],Clustering,Q1,N1), !,
    Q is Q1 + D,
    N is N1 + 1.
% a single remaining item is the finished clustering: no merge, no score
cluster([C-_],C,0,0).

% similarity(+Item1,+Item2,-Sim)
%   Sim is the dot product of the vectors stored as the right-hand side
%   of the two Key-Vector items.
similarity(_-Vec1, _-Vec2, Sim) :-
    dot(Vec1, Vec2, Sim),
    !.

% centroid(+Vectors,-Centroid)
%   Averages the vectors component-wise and rescales the mean with
%   normalize/3, using the value that norm/2 returns for it.
centroid(Vectors, Centroid) :-
   length(Vectors, Count),
   sum(Vectors, Total),
   divide(Total, Count, Mean),
   norm(Mean, MeanNorm),
   normalize(Mean, MeanNorm, Centroid).

% dot(+Xs,+Ys,-Dot)
%   Dot is the dot product of two equally long numeric lists; fails if
%   the lengths differ.
dot([], [], 0) :- !.
dot([A|As], [B|Bs], Dot) :-
    dot(As, Bs, Rest),
    Dot is Rest+A*B.

/*-----------------------------------------------------------*/
/*  show(+Clustering)
     Prints a clustering (nested list) as an indented tree,
     starting at column 0.                                     */
/*-----------------------------------------------------------*/

show(Clustering) :-
    show(Clustering, 0).

% show(+Tree,+Indent)
%   A list wrapping a single list is unwrapped.  Any other non-empty
%   list prints a node line, then its members two columns deeper.  The
%   node line is Distribution-Entropy when both can be computed and the
%   atom + otherwise.  Anything else is printed as a leaf.
show([[Head|Tail]], Indent) :-
    !,
    show([Head|Tail], Indent).
show([First|Others], Indent) :-
    (   distribution([First|Others], Distr),
        entropy([First|Others], Entropy),
        Label = Distr-Entropy
    ;   Label = (+)
    ),
    !,                            % use entropy if labeled
    tab(Indent),
    writeln(Label),
    Deeper is Indent+2,
    show(First, Deeper),
    showl(Others, Deeper).
show(Leaf, Indent) :-
    tab(Indent),
    !,
    writeln(Leaf).

% showl(+Trees,+Indent)
%   Prints every tree of the list with show/2 at the same indentation.
showl([], _) :- !.
showl([Tree|Trees], Indent) :-
    show(Tree, Indent),
    showl(Trees, Indent).

/*-----------------------------------------------------------*/
/*  entropy(+Clustering,-Entropy)
     Flattens the clustering, takes its class distribution and
     sums -p*log2(p) over the classes, where p is the class
     count divided by the number of flattened items.           */
/*-----------------------------------------------------------*/

entropy(Clustering, Entropy) :-
    flatten(Clustering, Items),
    distribution(Items, Distr),
    length(Items, Total),
    sumlogs(Distr, Total, Entropy).

% sumlogs(+Distribution,+Total,-Entropy)
%   Distribution is a list of Count-Class pairs; Entropy accumulates
%   -(Count/Total)*log2(Count/Total) over all of them.
sumlogs([], _, 0) :- !.
sumlogs([Count-_|Rest], Total, Entropy) :-
    sumlogs(Rest, Total, Partial),
    Entropy is Partial-(Count/Total)*log(Count/Total)/log(2).

/*-----------------------------------------------------------*/
/*  distribution(+Clustering,-Distribution)
     Distribution is a list of Count-Class pairs, one per class
     label found among the Id-Class items of the flattened
     clustering.  Fails when no such item exists.              */
/*-----------------------------------------------------------*/

distribution(Clustering, Distr) :-
    flatten(Clustering, Items),
    setof(Class, Id^member(Id-Class, Items), Classes),
    distr(Classes, Items, Distr).

% distr(+Classes,+Items,-Distribution)
%   Pairs every class with the number of Id-Class items carrying it.
distr([], _, []) :- !.
distr([Class|Classes], Items, [Count-Class|Distr]) :-
    findall(Class, member(_-Class, Items), Hits),
    length(Hits, Count),
    !,
    distr(Classes, Items, Distr).

/*-----------------------------------------------------------*/
/*  K-Nearest Neighbor Algorithm                             */
/*-----------------------------------------------------------*/
/*  knn: K-nearest neighbor                                  */
/*  knnw: Distance-weighted nearest neighbor                 */
/*-----------------------------------------------------------*/

% knn(+X,+K,+Examples,-Class)
%   Class is chosen by max/2 from the per-class vote counts of the K
%   neighbors that neighbors/4 returns for X.
knn(X, K, Examples, Class) :-
    neighbors(X, K, Examples, Nearest),
    sumv(Nearest, Votes),
    max(Votes, _-Class).

% knnw(+X,+K,+Examples,-Class)
%   Like knn/4, but every neighbor votes with its similarity to X
%   (summed per class by sumw/2) instead of counting as one vote.
knnw(X, K, Examples, Class) :-
    neighbors(X, K, Examples, Nearest),
    sumw(Nearest, Weights),
    max(Weights, _-Class).

% neighbors(+X,+K,+Examples,-Neighbors)
%   Scores every Id-Class-Vector example against X with similarity/3
%   and returns the K best as Similarity-Class pairs, best first.
neighbors(X, K, Examples, Neighbors) :-
    findall(Sim-Class,
            ( member(Id-Class-Vec, Examples),
              similarity(X, Id-Class-Vec, Sim)
            ),
            Scored),
    keysort(Scored, Ascending),
    reverse(Ascending, Descending),
    firstn(K, Descending, Neighbors).

% sumv(+Neighbors,-Votes)
%   Collapses a list of Key-Class pairs into Count-Class pairs, one per
%   class, in order of first appearance.
sumv([], []).
sumv([_-Class|Rest], [Count-Class|Votes]) :-
    delc(Class, Rest, Others, Count),
    sumv(Others, Votes).

% delc(+Class,+Pairs,-Others,-Count)
%   Removes every pair labeled Class from Pairs.  Count is the number
%   removed plus one (the pair the caller already took off the list).
delc(_, [], [], 1).
delc(Class, [_-Class|Rest], Others, Count) :-
    !,
    delc(Class, Rest, Others, Below),
    Count is Below+1.
delc(Class, [Other|Rest], [Other|Others], Count) :-
    delc(Class, Rest, Others, Count).

% sumw(+Neighbors,-Weights)
%   Collapses a list of Weight-Class pairs into Sum-Class pairs, one per
%   class, in order of first appearance.
sumw([], []).
sumw([Weight-Class|Rest], [Sum-Class|Weights]) :-
    delcw(Class, Rest, Others, RestSum),
    Sum is RestSum+Weight,
    sumw(Others, Weights).

% delcw(+Class,+Pairs,-Others,-Sum)
%   Removes every pair labeled Class from Pairs and sums their weights.
delcw(_, [], [], 0).
delcw(Class, [Weight-Class|Rest], Others, Sum) :-
    !,
    delcw(Class, Rest, Others, Below),
    Sum is Below+Weight.
delcw(Class, [Other|Rest], [Other|Others], Sum) :-
    delcw(Class, Rest, Others, Sum).

/*-----------------------------------------------------------*/
/*  Naive Bayes Classifier (discrete values)                 */
/*-----------------------------------------------------------*/
/*  Call: bayes(+TestData, +Examples, -Classification).
     Classification is the class that max/2 selects from the
     Score-Class list computed by probs/3.                     */
/*-----------------------------------------------------------*/

bayes(X, Examples, Class) :-
    probs(X, Examples, Scores),
    max(Scores, _-Class).

% probs(+X,+Examples,-Probs)
%   Probs is a list of Score-Class pairs, one per class occurring in
%   the Id-Class-Vector examples.  Score is the product of the
%   per-attribute conditional probabilities and the class prior.
probs(X, Examples, Probs) :-
    setof(Class, Id^Vec^member(Id-Class-Vec, Examples), Classes),
    findall(Score-Class,
            ( member(Class, Classes),
              cond_prob(X, Class, Examples, Likelihoods),
              class_prob(Class, Examples, Prior),
              mult(Likelihoods, Likelihood),
              Score is Likelihood*Prior
            ),
            Probs).


% cond_prob(+X,+Class,+Examples,-Probs)
%   Probs holds, for every position of the test vector X, the fraction
%   of Class examples that have the same value at that position.
cond_prob(X, Class, Examples, Probs) :-
    cond_prob(1, X, Class, Examples, Probs).

% cond_prob(+Pos,+Values,+Class,+Examples,-Probs)
%   Worker: Pos is the 1-based position of the first element of Values.
cond_prob(_, [], _, _, []) :- !.
cond_prob(Pos, [Value|Values], Class, Examples, [Prob|Probs]) :-
    findall(Seen,
            ( member(_-Class-Vec, Examples),
              nth1(Pos, Vec, Seen)
            ),
            Column),
    length(Column, Total),
    findall(Value, member(Value, Column), Matches),
    length(Matches, Hits),
    Prob is Hits/Total,
    Next is Pos+1,
    cond_prob(Next, Values, Class, Examples, Probs).

% class_prob(+Class,+Examples,-Prior)
%   Prior is the number of examples labeled Class divided by the number
%   of all examples.
class_prob(Class, Examples, Prior) :-
    findall(Key, member(Key-_, Examples), Everything),
    length(Everything, Total),
    findall(Id, member(Id-Class-_, Examples), InClass),
    length(InClass, Members),
    Prior is Members/Total.
    
% mult(+Numbers,-Product)
%   Product of all list elements; 1 for the empty list.
mult([], 1) :- !.
mult([Factor|Factors], Product) :-
    mult(Factors, Rest),
    Product is Rest*Factor.

/*---------------------------------------------------------*/
/* BELIEF NETWORK INTERPRETER                              */
/* Computational Intelligence: a logical approach.         */
/* Copyright (c) 1998, Poole, Mackworth, Goebel            */
/*                     and Oxford University Press.        */
/*---------------------------------------------------------*/
/* A belief network is represented with the relations
   variables(Xs) Xs is the list of random variables.
   Xs is ordered: parents of node are before the node.
   parents(X,Ps) Ps list of parents of variable X.
   Ps is ordered consistently with Xs
   values(X,Vs) Vs is the list of values of X
   pr(X,As,D) X is a variable, As is a list of Pi=Vi where
   Pi is a parent of X, and Vi is a value for variable Pi
   The elements of As are ordered consistently with Ps.    */
/*---------------------------------------------------------*/
/*---------------------------------------------------------*/
/* p(Var,Obs,VDist) is true if VDist represents the
   probability distribution P(Var|Obs), where Obs is a
   list of Variable=Value observations and Var itself is
   not observed.  The joint factors are built, every other
   unobserved variable is summed out, and the remaining
   factors are multiplied and normalized.                  */
/*---------------------------------------------------------*/
p(Var,Obs,VDist) :-
   relevant(Var,Obs,Relevant),
   to_sum_out(Relevant,Var,Obs,Eliminate),
   joint(Relevant,Obs,Factors),
   sum_out_each(Eliminate,Factors,Reduced),
   collectt(Reduced,Unnormalized),
   normalize(Unnormalized,0,_,VDist).

/*---------------------------------------------------------*/
/* relevant(Var,Obs,RelVars): RelVars are the variables
   relevant to query Var given observations Obs.  This
   most conservative version ignores both arguments and
   returns every variable of the network.                  */
/*---------------------------------------------------------*/
relevant(_,_,AllVars) :-
   variables(AllVars).

/*---------------------------------------------------------*/
/* to_sum_out(Vs,Var,Obs,SO): given all variables Vs, the
   query variable Var and the observations Obs, SO is the
   elimination ordering.  Naively, it is Vs in its given
   order without Var and without the observed variables.   */
/*---------------------------------------------------------*/
to_sum_out(Vs,Var,Obs,SO) :-
   remove(Var,Vs,WithoutQuery),
   remove_each_obs(Obs,WithoutQuery,SO).

/*---------------------------------------------------------*/
/* remove_each_obs(Obs,RVs,SO): SO is RVs without the
   variables that appear on the left of an observation
   Variable=Value in Obs.                                  */
/*---------------------------------------------------------*/
remove_each_obs([],Remaining,Remaining) :- !.
remove_each_obs([Observed=_|Others],Vars0,Remaining) :-
   remove_if_present(Observed,Vars0,Vars1),
   remove_each_obs(Others,Vars1,Remaining).

/*---------------------------------------------------------*/
/* A joint probability distribution is represented
as a list of distribution trees, of the form
         dtree(Vars,DTree) 
where Vars is a list of Variables (ordered
consistently with the ordering of variables), and
DTree is tree representation for the function from
values of variables into numbers such that if
Vars=[] then DTree is a number. Otherwise
Vars=[Var|RVars], and DTree is a list with one
element for each value of Var, and each element
is a tree representation for RVars. The ordering
of the elements in DTree is given by the ordering
of Vals given by values(Var,Vals).                         */

/*---------------------------------------------------------*/
/* joint(Vs,Obs,Joint): Vs is a list of variables and Obs
   an observation list.  Joint is a list of dtrees that
   take the observations into account, one for each
   variable whose factor still mentions at least one
   unobserved variable.                                    */
/*---------------------------------------------------------*/
joint([],_,[]) :- !.
joint([Var|Vars],Obs,[dtree(FreeVars,Tree)|Factors]) :-
   parents(Var,Parents),
   make_dvars(Parents,Var,Obs,FreeVars),
   FreeVars \== [], !,
   make_dtree(Parents,Var,Obs,[],Tree),
   joint(Vars,Obs,Factors).
joint([_|Vars],Obs,Factors) :-
   /* a factor without any free variable is dropped */
   joint(Vars,Obs,Factors).

/*---------------------------------------------------------*/
/* make_dvars(PX,X,Obs,DVars): X is a variable, PX its
   parents and Obs the observation list.  DVars is PX
   followed by X with every observed variable left out,
   i.e. DVars = ({X} U PX) - observed variables.
   This relies on PX being ordered before X.               */
/*---------------------------------------------------------*/
make_dvars([],Var,Obs,[]) :-
   member(Var=_,Obs), !.
make_dvars([],Var,_,[Var]).
make_dvars([Parent|Parents],Var,Obs,Free) :-
   member(Parent=_,Obs), !,
   make_dvars(Parents,Var,Obs,Free).
make_dvars([Parent|Parents],Var,Obs,[Parent|Free]) :-
   /* Parent is not observed */
   make_dvars(Parents,Var,Obs,Free).

/*---------------------------------------------------------*/
/* make_dtree(RP,X,Obs,Con,DTree) constructs the factor
   corresponding to p(X|PX).  RP is the list of remaining
   parents of X, Obs the observations and Con the context
   of Parent=Value assignments made so far, most recent
   first.  Observed parents are fixed to their observed
   value; every other parent adds one tree level with a
   branch per value.  If X itself is observed, only the
   entry of its distribution for the observed value is
   kept at the leaves.                                     */
/*---------------------------------------------------------*/
make_dtree([],Var,Obs,Context,Leaf) :-
   member(Var=Observed,Obs), !,
   reverse(Context,Assignments),
   pr(Var,Assignments,Dist),
   values(Var,Vals),
   select_corresp_elt(Vals,Observed,Dist,Leaf).
make_dtree([],Var,_,Context,Dist) :-
   reverse(Context,Assignments),
   pr(Var,Assignments,Dist).
make_dtree([Parent|Parents],Var,Obs,Context,Tree) :-
   member(Parent=Val,Obs),!,
   make_dtree(Parents,Var,Obs,[Parent=Val|Context],Tree).
make_dtree([Parent|Parents],Var,Obs,Context,Tree) :-
   values(Parent,Vals),
   make_dtree_for_vals(Vals,Parent,Parents,Var,Obs,Context,Tree).

/*---------------------------------------------------------*/
/* make_dtree_for_vals(Vals,P,RP,X,Obs,Con,DX) makes one
   subtree for each value in Vals, with P bound to that
   value in the context, and collects them into DX.  The
   other arguments are as for make_dtree.                  */
/*---------------------------------------------------------*/
make_dtree_for_vals([],_,_,_,_,_,[]) :- !.
make_dtree_for_vals([Val|Vals],Parent,Parents,Var,Obs,Context,
                    [Subtree|Subtrees]) :-
   make_dtree(Parents,Var,Obs,[Parent=Val|Context],Subtree),
   make_dtree_for_vals(Vals,Parent,Parents,Var,Obs,Context,Subtrees).

/*---------------------------------------------------------*/
/* select_corresp_elt(Vals,Val,List,Elt) is true if Elt is
   at the same position in List as Val is in Vals.
   Assumes Vals, Val and List are bound.                   */
/*---------------------------------------------------------*/
select_corresp_elt([Wanted|_],Wanted,[Found|_],Found) :-
   !.
select_corresp_elt([_|Keys],Wanted,[_|Items],Found) :-
   select_corresp_elt(Keys,Wanted,Items,Found).

/*---------------------------------------------------------*/
/* sum_out_each(SO,Joint0,Joint1) is true if Joint1 is
   the distribution Joint0 with each variable in SO
   summed out, in the order given by SO.                   */
/*---------------------------------------------------------*/
sum_out_each([],Joint,Joint) :- !.
sum_out_each([Var|Vars],Joint0,Joint) :-
   sum_out(Var,Joint0,Joint1),
   sum_out_each(Vars,Joint1,Joint).

/*---------------------------------------------------------*/
/* sum_out(X,J0,J1) is true if J1 is the distribution J0
   with variable X summed out.  The dtrees that mention X
   are replaced by one new dtree over their remaining
   variables; if no variable remains, or no dtree mentions
   X (remove/3 then fails), they are simply dropped.       */
/*---------------------------------------------------------*/
sum_out(X,J0,[dtree(CVars1,CTree)|NoX]) :-
   partition(J0,X,NoX,SomeX),
   variables(AllVars),
   /* CVars: all variables of the dtrees containing X, in
      the global variable order */
   find_tree_vars(SomeX,AllVars,CVars),
   remove(X,CVars,CVars1),
   CVars1 \== [], !,
   create_tree(CVars1,CVars1,SomeX,X,[],CTree).
sum_out(X,J0,NoX) :-
   /* remove any dtrees that have no variables */
   partition(J0,X,NoX,_).

/*---------------------------------------------------------*/
/* partition(J0,X,NoX,SomeX) splits the dtree list J0 into
   the dtrees whose variable list contains X (SomeX) and
   those that do not contain X (NoX), keeping the order.   */
/*---------------------------------------------------------*/
partition([],_,[],[]) :- !.
partition([dtree(Vars,Tree)|Rest],X,Without,[dtree(Vars,Tree)|With]) :-
   member(X,Vars),
   !,
   partition(Rest,X,Without,With).
partition([dtree(Vars,Tree)|Rest],X,[dtree(Vars,Tree)|Without],With) :-
   partition(Rest,X,Without,With).

/*---------------------------------------------------------*/
/* find_tree_vars(SomeX,AllVars,CVars) is true if CVars
   is the set of variables that appear in some dtree of
   SomeX, ordered according to AllVars.                    */
/*---------------------------------------------------------*/
find_tree_vars([],_,[]) :- !.
find_tree_vars([dtree(Vars,_)|Trees],All,Union) :-
    find_tree_vars(Trees,All,Others),
    ordered_union(Vars,Others,Union,All).

/*---------------------------------------------------------*/
/* create_tree(CVars,Vars,SomeX,X,Context,CTree)
   CTree is the tree corresponding to variables CVars.
   Vars is the full variable list of the new tree and
   Context the values chosen so far for its leading
   variables, most recent first.
   The values of the leaves of the tree are obtained
   by multiplying the corresponding values in SomeX and
   summing the products over all values of X.              */
/*---------------------------------------------------------*/
create_tree([],Vars,SomeX,X,Context,Num) :- 
   /* every variable has a value: compute the leaf number */
   reverse(Context,CVals),
   values(X,Vals),
   sum_vals(Vals,X,Vars,CVals,SomeX,0,Num), !. /* NOTE(review): this cut presumably only discards choice points left by sum_vals - confirm */
create_tree([Var|CVars],Vars,SomeX,X,Context,CTree) :-
   /* one subtree per value of the next variable */
   values(Var,Vals),
   create_tree_vals(Vals,CVars,Vars,SomeX,X,Context,CTree).

/*---------------------------------------------------------*/
/* create_tree_vals(Vals,CVars,Vars,SomeX,X,Context,CTree)
   creates one subtree for each value in Vals, pushing the
   value onto Context before descending.                   */
/*---------------------------------------------------------*/
create_tree_vals([],_,_,_,_,_,[]) :- !.
create_tree_vals([Val|Vals],Pending,Vars,
                     Factors,X,Context,[Subtree|Subtrees]) :-
   create_tree(Pending,Vars,Factors,X,[Val|Context],Subtree),
   create_tree_vals(Vals,Pending,Vars,Factors,X,Context,Subtrees).

/*---------------------------------------------------------*/
/* sum_vals(Vals,X,Vars,CVals,SomeX,Acc,Sum)
   sums out X in the context Vars=CVals.  Vals are the
   values of X still to be added, SomeX the factors whose
   product is taken for each value, Acc the running sum.   */
/*---------------------------------------------------------*/
sum_vals([],_,_,_,_,Sum,Sum) :- !.
sum_vals([Val|Vals],X,Vars,CVals,Factors,Acc0,Sum) :-
   mult_vals(Factors,Val,X,Vars,CVals,1,Term),
   Acc1 is Acc0+Term,
   sum_vals(Vals,X,Vars,CVals,Factors,Acc1,Sum).

/*---------------------------------------------------------*/
/* mult_vals(SomeX,Val,X,Vars,CVals,Acc,Prod)
   multiplies Acc by the entry that each factor of SomeX
   holds for X=Val and Vars=CVals.                         */
/*---------------------------------------------------------*/
mult_vals([],_,_,_,_,Prod,Prod) :- !.
mult_vals([Factor|Factors],Val,X,Vars,CVals,Acc0,Prod) :-
   lookup(X,Val,Vars,CVals,Factor,Entry),
   Acc1 is Acc0*Entry,
   mult_vals(Factors,Val,X,Vars,CVals,Acc1,Prod).

/*---------------------------------------------------------*/
/* lookup(Var0,Val0,Vars,Vals,dtree(DVars,DTree),Prob)
   DVars is a subset of Vars U {Var0}. Returns
   the value Prob by looking up "Var0=Val0 & Vars=Vals"
   in DTree.  It assumes that the elements of Vars
   and DVars are ordered consistently.                     */
/*---------------------------------------------------------*/
/* nothing left on either side: the tree is the number */
lookup(_,_,[],[],dtree([],P),P).
/* next context variable is also the next tree variable:
   descend into the branch of its value */
lookup(Var0,Val0,[Var|RVars],[Val|RVals],
           dtree([Var|TVars],DTree),Prob) :-
   !,
   values(Var,Vals),
   select_corresp_elt(Vals,Val,DTree,Subtree),
   lookup(Var0,Val0,RVars,RVals,dtree(TVars,Subtree),Prob).
/* next tree variable is Var0: descend into the branch of
   Val0 without consuming a context variable */
lookup(Var0,Val0,RVars,RVals,dtree([Var0|TVars],DTree),Prob):-
   !,
   values(Var0,Vals),
   select_corresp_elt(Vals,Val0,DTree,Subtree),
   lookup(Var0,Val0,RVars,RVals,dtree(TVars,Subtree),Prob).
/* context variable does not occur next in this tree: skip it */
lookup(Var0,Val0,[_|RVars],[_|RVals],DT,Prob) :-
   lookup(Var0,Val0,RVars,RVals,DT,Prob).

/*---------------------------------------------------------*/
/* collectt(Dist,DT) multiplies all of the factors in Dist
   together, forming a single DTree.  This assumes that
   every factor contains just the query variable, so each
   tree is a flat list of numbers.                         */
/*---------------------------------------------------------*/
collectt([dtree(_,Tree)],Tree) :- !.
collectt([dtree(_,Tree)|Factors],Product) :-
   collectt(Factors,Rest),
   multiply_corresp_elts(Tree,Rest,Product).

/*---------------------------------------------------------*/
/* multiply_corresp_elts(DT0,DT1,DT2): DT2 is the
   element-wise product of the equally long number lists
   DT0 and DT1.                                            */
/*---------------------------------------------------------*/
multiply_corresp_elts([],[],[]).
multiply_corresp_elts([A|As],[B|Bs],[Product|Products]) :-
   Product is A*B,
   multiply_corresp_elts(As,Bs,Products).

/*---------------------------------------------------------*/
/* normalize(List,CumVal,Sum,NList) makes NList the same
   as List, but scaled so that its elements sum to 1.
   Sum is the sum of the whole list and CumVal is the sum
   accumulated up to this point (0 in the initial call).   */
/*---------------------------------------------------------*/
normalize([],Total,Total,[]).
normalize([Item|Items],Acc0,Total,[Scaled|ScaledItems]) :-
   Acc1 is Acc0 + Item,
   normalize(Items,Acc1,Total,ScaledItems),
   Scaled is Item/Total.

/*---------------------------------------------------------*/
/* ordered_union(L0,L1,R,RL) is true if R = L0 U L1, where
   RL is a reference list that provides the ordering of
   the elements.  L0, L1 and RL must all be bound.  The
   reference list is walked once; each of its elements is
   emitted if it heads L0, L1 or both.                     */
/*---------------------------------------------------------*/
ordered_union([],Rest,Rest,_) :- !.
ordered_union(Rest,[],Rest,_) :- !.
ordered_union([Ref|As],[Ref|Bs],[Ref|Union],[Ref|Refs]) :-
   !,
   ordered_union(As,Bs,Union,Refs).
ordered_union([Ref|As],Bs,[Ref|Union],[Ref|Refs]) :-
   !,
   ordered_union(As,Bs,Union,Refs).
ordered_union(As,[Ref|Bs],[Ref|Union],[Ref|Refs]) :-
   !,
   ordered_union(As,Bs,Union,Refs).
ordered_union(As,Bs,Union,[_|Refs]) :-
   !,
   ordered_union(As,Bs,Union,Refs).

/*---------------------------------------------------------*/
/* STANDARD DEFINITIONS                                    */
/*---------------------------------------------------------*/
/* reverse(L,R) is true if R contains the same elements
   as list L, in reverse order.  rev/3 carries the part
   reversed so far as an accumulator.                      */
/*---------------------------------------------------------*/
reverse(List,Reversed) :-
   rev(List,[],Reversed).

rev([],Reversed,Reversed).
rev([Head|Tail],SoFar,Reversed) :-
   rev(Tail,[Head|SoFar],Reversed).

/*---------------------------------------------------------*/
/* remove(E,L,R) is true if R is the list L with one
   occurrence of E removed.  Fails if E does not occur in
   L; on backtracking each occurrence is removed in turn.  */
/*---------------------------------------------------------*/
remove(Item,[Item|Rest],Rest).
remove(Item,[Other|Rest],[Other|Kept]) :-
   remove(Item,Rest,Kept).

/*---------------------------------------------------------*/
/* remove_if_present(E,L,R) is true if R is the list L
   with the first occurrence of E removed, or L itself
   when E does not occur in it.                            */
/*---------------------------------------------------------*/
remove_if_present(_,[],[]).
remove_if_present(Item,[Item|Rest],Rest) :- !.
remove_if_present(Item,[Other|Rest],[Other|Kept]) :-
   remove_if_present(Item,Rest,Kept).

/*---------------------------------------------------------*/
/*  Induction of Decision Trees                            */
/*---------------------------------------------------------*/
/*---------------------------------------------------------*/
/* Example format: example(ID, Class, [A=V,...]).          */
/*---------------------------------------------------------*/
/* Converting document vectors into the example format:    */
/*   id3format(Vectors,IDF) - converts and stores          */
/*      examples in the Prolog database                    */
/* Use:                                                    */
/*   Create a decision tree: ?- id3(Tr).                   */
/*     (a node holding at most Tr examples is a leaf)      */
/*   Print tree:             ?- showtree.                  */
/*   Print rules:            ?- listing(if).               */
/*---------------------------------------------------------*/   

/*---------------------------------------------------------*/
% Operators that let the rules stored by assert_rules/0 be written as
%   if Conditions then Conclusion
:- op(100, fx, if).
:- op(99, xfy, then).
/*---------------------------------------------------------*/

% id3format(+Vectors,+IDF)
%   Discards all stored example/3 facts and stores one new fact per
%   labeled vector (see convert/2).
id3format(Vectors, IDF) :-
    retractall(example(_, _, _)),
    convert(Vectors, IDF).

% convert(+Vectors,+IDF)
%   Asserts example(Id,Class,[Term=Value,...]) for every
%   Id-Class-Values entry, naming the values after the IDF terms.
convert([], _) :- !.
convert([Id-Class-Values|Rest], IDF) :-
    add_terms(IDF, Values, Named),
    assertz(example(Id, Class, Named)),
    convert(Rest, IDF).

% add_terms(+IDF,+Values,-Named)
%   Zips the terms of the Weight-Term list IDF with Values into a list
%   of Term=Value pairs; both lists must have the same length.
add_terms([], [], []) :- !.
add_terms([_-Term|Terms], [Value|Values], [Term=Value|Named]) :-
   add_terms(Terms, Values, Named).

% id3(+Tr)
%   Rebuilds the decision tree from the stored example/3 facts: clears
%   old node/3 facts and if-then rules, grows the tree from `root` with
%   idt/4 and finally derives the rules.  Tr is the leaf-size threshold
%   handed to idt/4.  Fails if no example is stored.
id3(Tr) :-
    retractall(node(_, _, _)),
    retractall(if _ then _),
    findall(Id, example(Id, _, _), Ids),
    example(_, _, Pairs),
    !,
    get_attributes(Pairs, Attributes),
    idt(Ids, root, Attributes, Tr),
    assert_rules,
    !.

% idt(+Examples,+Parent,+Attributes,+Tr)
%   Grows the subtree below node Parent for the given example ids.
%   The clauses are tried in order:
%   1. at most Tr examples: store a leaf with their class distribution
idt(E,Parent,_,Tr) :-
    length(E,Len),
    Len=<Tr,
    distr(E, Distr),
    assertz(node(leaf,Distr,Parent)), !.
%   2. all examples share one class: store a leaf for that class
idt(E,Parent,_,_) :-
    distr(E, [C]),
    assertz(node(leaf,[C],Parent)).
%   3. otherwise split on the attribute picked by choose_attribute/5
idt(Es,Parent,As,Tr) :- 
    choose_attribute(Es,As,A,Values,Rest), !,
    partition(Values,A,Es,Parent,Rest,Tr).
%   4. no attribute could be chosen: report the examples together with
%      the test stored for Parent (fails if Parent has no node/3 fact)
idt(E,Parent,_,_) :- !,
    node(Parent,Test,_),
    write('Inconsistent data: cannot split '), write(E), write(' at node '), writeln(Test).

% get_attributes(+Pairs,-Attributes)
%   Attributes are the left-hand sides of the Attribute=Value pairs.
get_attributes([], []) :- !.
get_attributes([Attr=_|Pairs], [Attr|Attrs]) :-
    get_attributes(Pairs, Attrs).

% partition(+Values,+Attr,+Examples,+Parent,+RestAttrs,+Tr)
%   For every value of Attr: selects the examples having Attr=Value,
%   stores a freshly named node for that test below Parent and grows
%   the subtree of the subset with idt/4.
partition([], _, _, _, _, _) :- !.
partition([Value|Values], Attr, Examples, Parent, RestAttrs, Tr) :-
    get_subset(Examples, Attr=Value, Subset),
    !,
    gen_name(Node),
    assertz(node(Node, Attr=Value, Parent)),
    idt(Subset, Node, RestAttrs, Tr),
    !,
    partition(Values, Attr, Examples, Parent, RestAttrs, Tr).

% choose_attribute(+Es,+As,-A,-Values,-Rest)
%   Computes the information gain of every attribute in As over the
%   examples Es and lets maximum/2 pick the best one.  Values are the
%   values A takes in Es and Rest is As without A.
choose_attribute(Es,As,A,Values,Rest) :-
    length(Es,Total),
    information_content(Es,Total,Info),
    !,
    findall((A-Values)/Gain,
            ( member(A,As),
              get_values(Es,A,[],Values),
              split_into_subsets(Values,Es,A,Subsets),
              residual_information(Subsets,Total,Residual),
              Gain is Info - Residual
            ),
            Candidates),
    maximum(Candidates,(A-Values)/_),
    efface(A,As,Rest),
    !.

% split_into_subsets(+Values,+Examples,+Attr,-Subsets)
%   Subsets holds, for every value, the examples having Attr=Value.
split_into_subsets([], _, _, []) :- !.
split_into_subsets([Value|Values], Examples, Attr, [Subset|Subsets]) :-
    get_subset(Examples, Attr=Value, Subset),
    !,
    split_into_subsets(Values, Examples, Attr, Subsets).

% residual_information(+Subsets,+Total,-Residual)
%   Residual is the sum of the information content of each subset
%   weighted by its share (subset size / Total) of the examples.
residual_information([], _, 0) :- !.
residual_information([Subset|Subsets], Total, Residual) :-
    length(Subset, Size),
    information_content(Subset, Size, Info),
    !,
    residual_information(Subsets, Total, Others),
    Residual is Others + Info*Size/Total.

% information_content(+Examples,+Len,-Info)
%   Info is the sum computed by sum_terms/4 over the classes that the
%   stored example/3 facts assign to the given example ids.
information_content(Examples, Len, Info) :-
    setof(Class,
          Id^Pairs^( member(Id, Examples), example(Id, Class, Pairs) ),
          Classes),
    !,
    sum_terms(Classes, Examples, Len, Info).

% sum_terms(+Classes,+Examples,+Len,-Info)
%   Adds -(N/Len)*log2(N/Len) for every class, N being the number of
%   the given examples that belong to it.
sum_terms([], _, _, 0) :- !.
sum_terms([Class|Classes], Examples, Len, Info) :-
    findall(Id, ( member(Id, Examples), example(Id, Class, _) ), Members),
    length(Members, Count),
    sum_terms(Classes, Examples, Len, Others),
    Info is Others - (Count/Len)*(log(Count/Len)/log(2)).

% get_values(+Examples,+Attr,+Seen,-Values)
%   Collects the distinct values that Attr takes in the examples,
%   starting from the accumulator Seen; new values are pushed in front.
get_values([], _, Values, Values) :- !.
get_values([Id|Ids], Attr, Seen, Values) :-
    example(Id, _, Pairs),
    member(Attr=Value, Pairs),
    !,
    (   memberchk(Value, Seen)
    ->  get_values(Ids, Attr, Seen, Values)
    ;   get_values(Ids, Attr, [Value|Seen], Values)
    ).

% get_subset(+Examples,+Test,-Subset)
%   Subset keeps the example ids whose stored attribute list contains
%   Test (an Attribute=Value pair).
get_subset([], _, []) :- !.
get_subset([Id|Ids], Test, [Id|Subset]) :-
    example(Id, _, Pairs),
    member(Test, Pairs),
    !,
    get_subset(Ids, Test, Subset).
get_subset([_|Ids], Test, Subset) :-
    get_subset(Ids, Test, Subset).

% assert_rules
%   Stores one `if Path then Conclusion` rule for every path that
%   path/3 finds from the root to a leaf.  Always succeeds.
assert_rules :-
    forall(path(root, Path, Conclusion),
           assertz(if Path then Conclusion)).

% path(+Node,-Tests,-Leaf)
%   Tests is the list of node labels on a path from Node down to a
%   leaf, and Leaf is what is stored at that leaf.
path(Node, [], Leaf) :-
    node(leaf, Leaf, Node),
    !.
path(Node, [Test|Tests], Leaf) :-
    node(Child, Test, Node),
    path(Child, Tests, Leaf).

% distr(+Examples,-Dist)
%   Dist is a list of Class/Count terms for the classes that the stored
%   example/3 facts assign to the given ids.  Fails for an empty list.
distr(Examples, Dist) :-
    setof(Class,
          Id^Pairs^( member(Id, Examples), example(Id, Class, Pairs) ),
          Classes),
    countc(Classes, Examples, Dist).

% countc(+Classes,+Examples,-Dist)
%   Pairs every class with the number of given examples belonging to it.
countc([], _, []) :- !.
countc([Class|Classes], Examples, [Class/Count|Dist]) :-
    findall(Id, ( member(Id, Examples), example(Id, Class, _) ), Members),
    length(Members, Count),
    !,
    countc(Classes, Examples, Dist).

/*--------------------- Show tree ---------------------------*/
% showtree
%   Prints the stored decision tree, starting below the root.
showtree :-
    showtree(root, 0).

% showtree(+Node,+Pos)
%   A node with a leaf below it prints " => " and the leaf contents on
%   the current line; otherwise its children are listed two columns
%   deeper.
showtree(Node, _) :-
    node(leaf, Class, Node),
    !,
    write(' => '),
    write(Class).
showtree(Node, Pos) :-
    findall(Child, node(Child, _, Node), Children),
    Deeper is Pos+2,
    show_list(Children, Deeper).

% show_list(+Nodes,+Pos)
%   Prints every node's test on a new line indented by Pos, followed by
%   its subtree.
show_list([], _) :- !.
show_list([Node|Nodes], Pos) :-
    node(Node, Test, _),
    nl,
    tab(Pos),
    write(Test),
    showtree(Node, Pos),
    show_list(Nodes, Pos).


/*-----------------------------------------------------------*/
/*                       Utilities                           */
/*-----------------------------------------------------------*/

% corpus(+Files,-Words)
%   Words is the concatenation, in file order, of the word lists that
%   tokenize/1 reads from the files.
corpus([], []) :- !.
corpus([File|Files], Words) :-
    see(File),
    tokenize(FileWords),
    seen,
    corpus(Files, OtherWords),
    append(FileWords, OtherWords, Words).

% firstn(+N,+List,-Prefix)
%   Prefix holds the first N elements of List, or all of them when
%   List is shorter.
firstn(0, _, []) :- !.
firstn(_, [], []) :- !.
firstn(N, [Item|Items], [Item|Prefix]) :-
    Left is N-1,
    firstn(Left, Items, Prefix).

% firstn1(+N,+Pairs,-Values)
%   Like firstn/3 for a list of Key-Value pairs, but returns only the
%   values.
firstn1(0, _, []) :- !.
firstn1(_, [], []) :- !.
firstn1(N, [_-Value|Pairs], [Value|Values]) :-
    Left is N-1,
    firstn1(Left, Pairs, Values).

% wfreq(+Words,-Frequencies)
%   Frequencies is a list of Count-Word pairs ordered by descending
%   count (keysort/2 followed by reverse/2).
wfreq(Words, Descending) :-
    wf(Words, Counted),
    keysort(Counted, Ascending),
    reverse(Ascending, Descending).

% wf(+Words,-Counted)
%   Pairs every distinct word with its number of occurrences, in order
%   of first appearance; count/4 removes the counted word from the rest.
wf([], []) :- !.
wf([Word|Words], [Count-Word|Counted]) :-
    count([Word|Words], Word, Others, Count),
    !,
    wf(Others, Counted).

% update_idf(+Terms,+Words,-Updated)
%   Updates the document counts of Terms for one document with word
%   list Words: an already counted Count-Term is incremented, a bare
%   Term becomes 1-Term, and terms absent from Words stay unchanged.
update_idf([], _, []) :- !.
update_idf([Count-Term|Terms], Words, [Count1-Term|Updated]) :-
    memberchk(Term, Words),
    !,
    Count1 is Count+1,
    update_idf(Terms, Words, Updated).
update_idf([Term|Terms], Words, [1-Term|Updated]) :-
    memberchk(Term, Words),
    !,
    update_idf(Terms, Words, Updated).
update_idf([Entry|Terms], Words, [Entry|Updated]) :-
    update_idf(Terms, Words, Updated).

% calc_idf(+Counts,+D,-IDFs)
%   Replaces every document count N of a Count-Term pair by
%   log((1+D)/N), D being the number of documents.
calc_idf([], _, []) :- !.
calc_idf([Count-Term|Counts], D, [Weight-Term|Weights]) :-
    Weight is log((1+D)/Count),
    calc_idf(Counts, D, Weights).

% binvalues(+IDF,+Words,-Bits)
%   Bits has a 1 for every Weight-Term of IDF whose term occurs in
%   Words and a 0 for every other entry.
binvalues([], _, []) :- !.
binvalues([_-Term|Terms], Words, [1|Bits]) :-
    memberchk(Term, Words),
    !,
    binvalues(Terms, Words, Bits).
binvalues([_|Terms], Words, [0|Bits]) :-
    binvalues(Terms, Words, Bits).

% termvalues(+IDF,+WL,-Values)
%   For every Weight-Term pair of IDF, Values holds the relative
%   frequency of the term in the word list WL multiplied by the weight
%   (not the Cornell SMART formula).
%   The document length is computed once instead of once per term.  An
%   empty document yields 0 for every term; it used to raise a
%   zero-divisor error from N/M with M = 0.
termvalues(IDF,WL,Values) :-
    length(WL,M),
    termvalues_(IDF,WL,M,Values).

% termvalues_(+IDF,+WL,+M,-Values)
%   Worker for termvalues/3; M is the length of WL.
termvalues_([],_,_,[]) :- !.
termvalues_([IDF-W|T],WL,M,[V|L]) :-
    (   M =:= 0
    ->  V = 0
    ;   findall(W,member(W,WL),Hits),   % occurrences of W in WL
        length(Hits,N),
        V is (N/M)*IDF
    ),
    termvalues_(T,WL,M,L).

% norm(+Vector,-SumOfSquares)
%   SumOfSquares is the sum of the squared components (no square root
%   is taken here); 0 for the empty vector.
norm([], 0) :- !.
norm([Value|Values], SumOfSquares) :-
    norm(Values, Rest),
    SumOfSquares is Value*Value+Rest.

% normalize(+Vector,+N,-Normalized)
%   Divides every component by sqrt(N), where N is the sum of squares
%   returned by norm/2.
%   - A zero N leaves the vector unchanged.  The test is arithmetic so
%     that the float 0.0 is recognized too; the former head pattern
%     only matched the integer 0 and 0.0 ended in 0.0/0.0.
%   - sign(V) restores the sign that sqrt(V*V/N) alone discards, so
%     negative components stay negative.
normalize(V,N,V) :- N =:= 0, !.
normalize([],_,[]) :- !.
normalize([V|T],N,[NV|L]) :-
    NV is sign(V)*sqrt(V*V/N),
    normalize(T,N,L).

% sum(+Vectors,-Total)
%   Component-wise sum of a non-empty list of non-empty, equally long
%   vectors.
sum([[First|Rest]], [First|Rest]) :- !.
sum([[First|Rest]|Vectors], Total) :-
   sum(Vectors, Others),
   sum2([First|Rest], Others, Total).

% sum2(+Xs,+Ys,-Zs)
%   Zs is the component-wise sum of two equally long vectors.
sum2([], [], []) :- !.
sum2([A|As], [B|Bs], [C|Cs]) :-
   C is A+B,
   sum2(As, Bs, Cs).

% divide(+Vector,+Divisor,-Quotients)
%   Divides every component of Vector by Divisor.
divide([], _, []) :- !.
divide([Value|Values], Divisor, [Quotient|Quotients]) :-
    Quotient is Value/Divisor,
    divide(Values, Divisor, Quotients).

% max(+Pairs,-Best)
%   Best is the Key-Value pair of the non-empty list Pairs with the
%   largest numeric key; among equal keys the last one wins.
%   The choice is made with if-then-else.  With the former
%   `(M>K, ... ; ...)` a failed unification after M>K fell through to
%   the other branch, so a call with a bound result could succeed for a
%   pair that is not the maximum.
max([X],X) :- !.
max([M-X|T],N-Y) :-
    max(T,K-Z),
    (   M > K
    ->  N-Y = M-X
    ;   N-Y = K-Z
    ).

% maximum(+Items,-Best)
%   Best is the Term/Score element of the non-empty list Items with the
%   largest numeric score; among equal scores the last one wins.
%   Uses if-then-else for the same reason as max/2: the former
%   `(M>K, ... ; ...)` could accept a non-maximal element when called
%   with a bound result.
maximum([X],X) :- !.
maximum([X/M|T],Y/N) :-
    maximum(T,Z/K),
    (   M > K
    ->  Y/N = X/M
    ;   Y/N = Z/K
    ).

% pair(+List,-A,-B)
%   Enumerates on backtracking every pair A, B of elements of List in
%   which A comes before B.
pair([First|Later], First, Second) :-
    member(Second, Later).
pair([_|Later], First, Second) :-
    pair(Later, First, Second).

% del(+Item,+List,-Rest)
%   Rest is List without one occurrence of Item; each occurrence is
%   removed in turn on backtracking.  Fails if Item is not in List.
del(Item, [Item|Rest], Rest).
del(Item, [Other|Items], [Other|Rest]) :-
    del(Item, Items, Rest).

% subset(+Sub,+List)
%   True if every element of Sub unifies with some element of List.
subset([], _) :- !.
subset([Item|Items], List) :-
    member(Item, List),
    !,
    subset(Items, List).

% intersection(+Xs,+Ys,-Common)
%   Common holds the elements of Xs that also occur in Ys, in the order
%   of Xs.
intersection([], _, []) :- !.
intersection([Item|Items], Others, [Item|Common]) :-
    member(Item, Others),
    !,
    intersection(Items, Others, Common).
intersection([_|Items], Others, Common) :-
    intersection(Items, Others, Common).

% count(+List,+Item,-Others,-N)
%   N is the number of occurrences of Item in List and Others is List
%   with all of them removed.
count([], _, [], 0) :- !.
count([Item|Rest], Item, Others, N) :-
    !,
    count(Rest, Item, Others, Below),
    N is Below+1.
count([Other|Rest], Item, [Other|Others], N) :-
    !,
    count(Rest, Item, Others, N).

% ppl(+List)
%   Prints every element of List on a line of its own.
ppl([]) :- !.
ppl([Item|Items]) :-
    writeln(Item),
    ppl(Items).

% gen_name(-Name)
%   Returns the next integer of a counter kept in the fact nam/1.  The
%   first call, when no nam/1 fact exists yet, returns 1.
gen_name(Next) :-
   retract(nam(Last)),
   Next is Last+1,
   assert(nam(Next)),
   !.
gen_name(1) :-
   assert(nam(1)).

% efface(+Item,+List,-Rest)
%   Rest is List without the first occurrence of Item.  Fails if Item
%   does not occur in List.
efface(Item, [Item|Rest], Rest) :- !.
efface(Item, [Other|Items], [Other|Rest]) :-
   efface(Item, Items, Rest).


/*-------------- Reading and tokenizing text ----------------*/

% tokenize(-Words)
%   Reads the current input stream to its end and returns its words.
%   Characters rejected by charclass/2 are skipped; every run of
%   accepted characters is converted with name/2 and kept unless
%   stopword/1 holds for it.
tokenize([]) :-
    peek_byte(-1),
    !.
tokenize(Words) :-
    peek_byte(Byte),
    \+ charclass(Byte, _),
    !,
    get0(_),
    tokenize(Words).
tokenize(Words) :-
    getword(Codes),
    name(Word, Codes),
    (   stopword(Word),
        Words = Rest
    ;   Words = [Word|Rest]
    ),
    !,
    tokenize(Rest).

% getword(-Codes)
%   Reads characters from the current input while charclass/2
%   accepts them and returns their mapped codes (charclass/2
%   lower-cases upper-case letters).
%   The character that ends the word has already been read by
%   get0/1 when charclass/2 rejects it and is not put back, so that
%   delimiter (or the end-of-file marker) is consumed as well.
getword([X|T]) :-
    get0(C), charclass(C,X), !,
    getword(T).
% Reached when the character just read is not a word character.
getword([]).

/*-----------------------------------------------------------*/
/*      Character classes accepted by tokenize               */
/*-----------------------------------------------------------*/

% charclass(+Code,-Mapped)
%   Succeeds for the integer character codes accepted as part of a
%   word: digits and lower-case letters map to themselves, and
%   upper-case letters map to their lower-case code (Code+32).
%   Fails for every other code, including the end-of-file value -1.
charclass(Code, Code) :- Code >= 0'0, Code =< 0'9, !.       % digit
charclass(Code, Code) :- Code >= 0'a, Code =< 0'z, !.       % lower case
charclass(Code, Lower) :-                                    % upper case
    Code >= 0'A, Code =< 0'Z,
    Lower is Code + 32, !.

/*-----------------------------------------------------------*/
/*      Stopwords (words ignored by tokenize)                */
/*-----------------------------------------------------------*/

% stopword(+Word)
%   Succeeds when Word is to be ignored by tokenize/1.
%   The first clause treats every word whose name is a single
%   character as a stopword; the facts that follow list the
%   remaining stopwords in alphabetical order.
%   `fify' is a misspelling of `fifty' in the original list.  It is
%   kept for backward compatibility and the intended word `fifty'
%   is listed next to it.
stopword(W) :- name(W,[_]), !.
stopword(a).
stopword(about).
stopword(above).
stopword(across).
stopword(after).
stopword(afterwards).
stopword(again).
stopword(against).
stopword(all).
stopword(almost).
stopword(alone).
stopword(along).
stopword(already).
stopword(also).
stopword(although).
stopword(always).
stopword(am).
stopword(among).
stopword(amongst).
stopword(amoungst).
stopword(amount).
stopword(an).
stopword(and).
stopword(another).
stopword(any).
stopword(anyhow).
stopword(anyone).
stopword(anything).
stopword(anyway).
stopword(anywhere).
stopword(are).
stopword(around).
stopword(as).
stopword(at).
stopword(back).
stopword(be).
stopword(became).
stopword(because).
stopword(become).
stopword(becomes).
stopword(becoming).
stopword(been).
stopword(before).
stopword(beforehand).
stopword(behind).
stopword(being).
stopword(below).
stopword(beside).
stopword(besides).
stopword(between).
stopword(beyond).
stopword(bill).
stopword(both).
stopword(bottom).
stopword(but).
stopword(by).
stopword(call).
stopword(can).
stopword(cannot).
stopword(cant).
stopword(co).
stopword(con).
stopword(could).
stopword(couldnt).
stopword(cry).
stopword(de).
stopword(describe).
stopword(detail).
stopword(do).
stopword(done).
stopword(down).
stopword(due).
stopword(during).
stopword(each).
stopword(eg).
stopword(eight).
stopword(either).
stopword(eleven).
stopword(else).
stopword(elsewhere).
stopword(empty).
stopword(enough).
stopword(etc).
stopword(even).
stopword(ever).
stopword(every).
stopword(everyone).
stopword(everything).
stopword(everywhere).
stopword(except).
stopword(few).
stopword(fifteen).
stopword(fify).
stopword(fifty).        % intended spelling of the entry above
stopword(fill).
stopword(find).
stopword(fire).
stopword(first).
stopword(five).
stopword(for).
stopword(former).
stopword(formerly).
stopword(forty).
stopword(found).
stopword(four).
stopword(from).
stopword(front).
stopword(full).
stopword(further).
stopword(get).
stopword(give).
stopword(go).
stopword(had).
stopword(has).
stopword(hasnt).
stopword(have).
stopword(he).
stopword(hence).
stopword(her).
stopword(here).
stopword(hereafter).
stopword(hereby).
stopword(herein).
stopword(hereupon).
stopword(hers).
stopword(herself).
stopword(him).
stopword(himself).
stopword(his).
stopword(how).
stopword(however).
stopword(hundred).
stopword(i).
stopword(ie).
stopword(if).
stopword(in).
stopword(inc).
stopword(indeed).
stopword(interest).
stopword(into).
stopword(is).
stopword(it).
stopword(its).
stopword(itself).
stopword(keep).
stopword(last).
stopword(latter).
stopword(latterly).
stopword(least).
stopword(less).
stopword(ltd).
stopword(made).
stopword(many).
stopword(may).
stopword(me).
stopword(meanwhile).
stopword(might).
stopword(mill).
stopword(mine).
stopword(more).
stopword(moreover).
stopword(most).
stopword(mostly).
stopword(move).
stopword(much).
stopword(must).
stopword(my).
stopword(myself).
stopword(name).
stopword(namely).
stopword(nearby).
stopword(neither).
stopword(never).
stopword(nevertheless).
stopword(next).
stopword(nine).
stopword(no).
stopword(nobody).
stopword(none).
stopword(noone).
stopword(nor).
stopword(not).
stopword(nothing).
stopword(now).
stopword(nowhere).
stopword(of).
stopword(off).
stopword(often).
stopword(on).
stopword(once).
stopword(one).
stopword(only).
stopword(onto).
stopword(or).
stopword(other).
stopword(others).
stopword(otherwise).
stopword(our).
stopword(ours).
stopword(ourselves).
stopword(out).
stopword(over).
stopword(own).
stopword(part).
stopword(per).
stopword(perhaps).
stopword(please).
stopword(put).
stopword(rather).
stopword(re).
stopword(same).
stopword(see).
stopword(seem).
stopword(seemed).
stopword(seeming).
stopword(seems).
stopword(serious).
stopword(several).
stopword(she).
stopword(should).
stopword(show).
stopword(side).
stopword(since).
stopword(sincere).
stopword(six).
stopword(sixty).
stopword(so).
stopword(some).
stopword(somehow).
stopword(someone).
stopword(something).
stopword(sometime).
stopword(sometimes).
stopword(somewhere).
stopword(still).
stopword(such).
stopword(take).
stopword(ten).
stopword(than).
stopword(that).
stopword(the).
stopword(their).
stopword(them).
stopword(themselves).
stopword(then).
stopword(thence).
stopword(there).
stopword(thereafter).
stopword(thereby).
stopword(therefore).
stopword(therein).
stopword(thereupon).
stopword(these).
stopword(they).
stopword(thick).
stopword(thin).
stopword(third).
stopword(this).
stopword(those).
stopword(though).
stopword(three).
stopword(through).
stopword(throughout).
stopword(thru).
stopword(thus).
stopword(to).
stopword(together).
stopword(too).
stopword(top).
stopword(toward).
stopword(towards).
stopword(twelve).
stopword(twenty).
stopword(two).
stopword(un).
stopword(under).
stopword(until).
stopword(up).
stopword(upon).
stopword(us).
stopword(very).
stopword(via).
stopword(was).
stopword(we).
stopword(well).
stopword(were).
stopword(what).
stopword(whatever).
stopword(when).
stopword(whence).
stopword(whenever).
stopword(where).
stopword(whereafter).
stopword(whereas).
stopword(whereby).
stopword(wherein).
stopword(whereupon).
stopword(wherever).
stopword(whether).
stopword(which).
stopword(while).
stopword(whither).
stopword(who).
stopword(whoever).
stopword(whole).
stopword(whom).
stopword(whose).
stopword(why).
stopword(will).
stopword(with).
stopword(within).
stopword(without).
stopword(would).
stopword(yet).
stopword(you).
stopword(your).
stopword(yours).
stopword(yourself).
stopword(yourselves).