/*-----------------------------------------------------------*/
/* Text Mining Primitives (vector-space model, clustering)   */
/*-----------------------------------------------------------*/
/* (C) 2004 Zdravko Markov                                   */
/*-----------------------------------------------------------*/

/*-----------------------------------------------------------*/
/* Vector Space Model                                        */
/*-----------------------------------------------------------*/

/*-----------------------------------------------------------*/
/* vectors(+Files,+IDF,-Vectors)                             */
/*   Generates a TFIDF vector for each document in Files.    */
/*   An element of Files is a file name F or a labeled pair  */
/*   F-C; the matching result element is F-V or F-C-V.       */
/*-----------------------------------------------------------*/
/* binvectors(+Files,+IDF,-Vectors)                          */
/*   Generates a binary vector for each document in Files    */
/*   (same input and output shapes as vectors/3).            */
/*-----------------------------------------------------------*/
vectors([],_,[]) :- !.
vectors([F-C|T],IDF,[F-C-V|L]) :- !,
    vector(F,IDF,V), !,
    vectors(T,IDF,L).
vectors([F|T],IDF,[F-V|L]) :-
    vector(F,IDF,V), !,
    vectors(T,IDF,L).

/* vector(+File,+IDF,-NVector)
   Tokenizes File, weighs every term of IDF and normalizes the
   result. The input stream is closed (seen/0) even if the
   tokenizer fails or raises an error.                         */
vector(File,IDF,NVector) :-
    setup_call_cleanup(see(File), once(tokenize(WL)), seen),
    termvalues(IDF,WL,Vector),
    norm(Vector,Norm), !,
    /* Compare arithmetically: a float 0.0 norm (document with
       none of the terms) must not be normalized, otherwise
       normalize/3 divides by zero.                            */
    (   Norm =\= 0,
        normalize(Vector,Norm,NVector)
    ->  true
    ;   NVector = Vector
    ), !.

binvectors([],_,[]) :- !.
binvectors([F-C|T],IDF,[F-C-V|L]) :- !,
    binvector(F,IDF,V), !,
    binvectors(T,IDF,L).
binvectors([F|T],IDF,[F-V|L]) :-
    binvector(F,IDF,V), !,
    binvectors(T,IDF,L).

/* binvector(+File,+IDF,-Vector)
   Tokenizes File and marks each term of IDF as present (1) or
   absent (0). The input stream is always closed.              */
binvector(File,IDF,Vector) :-
    setup_call_cleanup(see(File), once(tokenize(WL)), seen),
    binvalues(IDF,WL,Vector).

/*-----------------------------------------------------------*/
/* idf(+Files,+Terms,+N,-IDFN)                               */
/*   Returns the top N terms according to their IDF in Files */
/*   as a list of IDF-Term pairs. Terms that occur in no     */
/*   file have no document count and are left out.           */
/*-----------------------------------------------------------*/
idf(Files,Terms,N,IDFN) :-
    idfcount(Files,Terms,IDF1), !,
    /* idfcount/3 leaves never-seen terms as bare terms;
       keysort/2 accepts only Key-Value pairs, so keep only
       the counted ones.                                       */
    findall(Cnt-W,member(Cnt-W,IDF1),Counted),
    keysort(Counted,IDF2),
    length(Files,D),
    calc_idf(IDF2,D,IDF),
    firstn(N,IDF,IDFN).

/* idfcount(+Files,+IDF0,-IDF)
   Threads the term list through update_idf/3 once per file,
   so that every term found in a file gets its count raised.   */
idfcount([],IDF,IDF) :- !.
idfcount([F|T],IDF,IDF2) :-
    setup_call_cleanup(see(F), once(tokenize(WL)), seen),
    update_idf(IDF,WL,IDF1),
    idfcount(T,IDF1,IDF2).
/*-----------------------------------------------------------*/
/* tf(+Files,+N,-Terms)                                      */
/*   Returns the most frequent N terms in Files              */
/*-----------------------------------------------------------*/
tf(Files,N,Terms) :-
    corpus(Files,WL),
    wfreq(WL,FL),
    firstn1(N,FL,Terms).

/*-----------------------------------------------------------*/
/* class(+Files,+Classes,-LabeledFiles)                      */
/*   Labels files with classes. Classes is a list of         */
/*   Class-Members pairs; each file X becomes X-Class for    */
/*   the first class whose member list contains it. Fails    */
/*   if some file belongs to no class.                       */
/*-----------------------------------------------------------*/
class([],_,[]) :- !.
class([X|T],CL,[X-C|V]) :-
    member(C-L,CL),
    memberchk(X,L), !,
    class(T,CL,V).

/*-----------------------------------------------------------*/
/* arff(+Terms,+Vectors,+File)                               */
/*   Writes Vectors to File in WEKA ARFF format. Terms is a  */
/*   list of _-Name pairs giving the attribute names.        */
/*   Output is restored (told/0) even if printing fails or   */
/*   raises an error.                                        */
/*-----------------------------------------------------------*/
arff(Terms,Vectors,File) :-
    setup_call_cleanup(
        tell(File),
        once(( write('@relation '), writeln(File), nl,
               writeln('@attribute id string'),
               printattr(Terms),
               printclassattr(Vectors), nl,
               writeln('@data'),
               printdata(Vectors) )),
        told).

/* One "@attribute <name> numeric" line per term. */
printattr([]) :- !.
printattr([_-A|T]) :-
    write('@attribute '), write(A), tab(1), writeln('numeric'),
    printattr(T).

/* Prints the nominal class attribute when the vectors are
   labeled (Id-Class-Data); prints nothing otherwise.          */
printclassattr([X-C-D|T]) :- !,
    setof(Cl,A^B^member(A-Cl-B,[X-C-D|T]),Cs),
    write('@attribute class '),
    write('{'), printrow(Cs), writeln('}').
printclassattr(_).

/* One comma-separated row per vector; labeled vectors get the
   class appended as the last column.                          */
printdata([]) :- !.
printdata([X-C-D|T]) :- !,
    printrow([X|D]), write(','), writeln(C),
    printdata(T).
printdata([X-D|T]) :- !,
    printrow([X|D]), nl,
    printdata(T).

/* Writes a non-empty list separated by commas. */
printrow([X]) :-
    write(X), !.
printrow([X|T]) :-
    write(X), write(','),
    printrow(T).

/*-----------------------------------------------------------*/
/* Agglomerative Hierarchical Clustering                     */
/*-----------------------------------------------------------*/

/*-----------------------------------------------------------*/
/* cluster(+E, -Quality)                                     */
/*   E = [x1-[v11,v12,...],x2-[v21,v22,...],...]             */
/*   Quality is the average similarity between all clusters  */
/*   that were merged; it is 0 when no merge took place      */
/*   (a single example).                                     */
/*-----------------------------------------------------------*/
cluster(E,Quality) :-
    cluster(E,Clustering,Quality),
    show(Clustering).

cluster(E,Clustering,Quality) :-
    cluster(E,Clustering,D,N),
    /* N counts the merges; with one example there are none
       and D/N would be a division by zero.                    */
    (   N =:= 0
    ->  Quality = 0
    ;   Quality is D/N
    ).

/* cluster(+E,-Clustering,-SimSum,-Merges)
   Repeatedly merges the most similar pair into a cluster
   [A,B] represented by the centroid of the two vectors.       */
cluster(E,Clustering,Q,N) :-
    findall([A,B]/D,(pair(E,A,B),similarity(A,B,D)),All),
    maximum(All,[A-X,B-Y]/D),
    del(A-X,E,E1),
    del(B-Y,E1,E2),
    centroid([X,Y],C),
    cluster([[A,B]-C|E2],Clustering,Q1,N1), !,
    Q is Q1 + D,
    N is N1 + 1.
cluster([C-_],C,0,0).

/* Similarity of two Name-Vector items is the dot product. */
similarity(_-X,_-Y,D) :-
    dot(X,Y,D), !.

/* centroid(+Vectors,-C): normalized mean of the vectors. An
   all-zero mean (integer or float zero norm) is returned
   unchanged instead of being divided by zero.                 */
centroid(L,C) :-
    sum(L,S),
    length(L,N),
    divide(S,N,C1),
    norm(C1,Norm),
    (   Norm =:= 0
    ->  C = C1
    ;   normalize(C1,Norm,C)
    ).

dot([],[],0) :- !.
dot([X|T],[Y|V],D) :-
    dot(T,V,P),
    D is P+X*Y.

/*-----------------------------------------------------------*/
/* show(+Clustering)                                         */
/*   Prints clustering (nested list) as a horizontal tree    */
/*-----------------------------------------------------------*/
show(C) :- show(C,0).

show([[H|T]],P) :- !,
    show([H|T],P).
show([X|Y],P) :-
    (   distribution([X|Y],D), entropy([X|Y],H), Node = D-H
    ;   Node = (+)
    ), !,                               % use entropy if labeled
    tab(P), writeln(Node),
    P1 is P+2,
    show(X,P1),
    showl(Y,P1).
show(X,P) :-
    tab(P), !,
    writeln(X).

showl([],_) :- !.
showl([X|T],P) :-
    show(X,P),
    showl(T,P).

/*-----------------------------------------------------------*/
/* entropy(+Clustering,-Entropy)                             */
/*   Computes entropy in a labeled clustering                */
/*-----------------------------------------------------------*/
entropy(L,I) :-
    flatten(L,FL),
    distribution(FL,D),
    length(FL,N),
    sumlogs(D,N,I).

/* sumlogs(+Distribution,+N,-S): S = - sum (X/N)*log2(X/N)
   over the Count-Class pairs of Distribution.                 */
sumlogs([],_,0) :- !.
sumlogs([X-_|T],N,S) :-
    sumlogs(T,N,S1),
    S is S1-(X/N)*log(X/N)/log(2).

/*-----------------------------------------------------------*/
/* distribution(+Clustering,-Distribution)                   */
/*   Computes the distribution of class labels in Clustering */
/*   as a list of Count-Class pairs. Fails when no element   */
/*   has the form Item-Class.                                */
/*-----------------------------------------------------------*/
distribution(L,D) :-
    flatten(L,FL),
    setof(C,X^member(X-C,FL),CL),
    distr(CL,FL,D).

distr([],_,[]) :- !.
/* distr/3, recursive clause: counts how many Item-Class
   elements of the list carry the class at the head.           */
distr([Class|Classes],Items,[Count-Class|Rest]) :-
    findall(Class,member(_-Class,Items),Hits),
    length(Hits,Count), !,
    distr(Classes,Items,Rest).

/*-----------------------------------------------------------*/
/* K-Nearest Neighbor Algorithm                              */
/*-----------------------------------------------------------*/
/* knn:  K-nearest neighbor (majority vote)                  */
/* knnw: Distance-weighted nearest neighbor (similarity sum) */
/*-----------------------------------------------------------*/
knn(X,K,Examples,Class) :-
    neighbors(X,K,Examples,Nearest),
    sumv(Nearest,Votes),
    max(Votes,_-Class).

knnw(X,K,Examples,Class) :-
    neighbors(X,K,Examples,Nearest),
    sumw(Nearest,Weights),
    max(Weights,_-Class).

/* neighbors(+X,+K,+Examples,-Neighbors)
   Neighbors are the K Similarity-Class pairs with the highest
   similarity between X and the Id-Class-Vector examples.      */
neighbors(X,K,Examples,Neighbors) :-
    findall(Sim-Cl,
            ( member(Id-Cl-Vec,Examples),
              similarity(X,Id-Cl-Vec,Sim) ),
            Scored),
    keysort(Scored,Ascending),
    reverse(Ascending,Descending),
    firstn(K,Descending,Neighbors).

/* sumv(+Pairs,-Counts): Count-Class for every distinct class. */
sumv([],[]).
sumv([_-Cl|Others],[Count-Cl|Rest]) :-
    delc(Cl,Others,Remaining,Count),
    sumv(Remaining,Rest).

/* delc(+Class,+Pairs,-Remaining,-Count): removes the pairs of
   Class; Count starts at 1 for the pair already taken off.    */
delc(_,[],[],1).
delc(Cl,[_-Cl|Pairs],Remaining,Count) :- !,
    delc(Cl,Pairs,Remaining,Count0),
    Count is Count0+1.
delc(Cl,[Other|Pairs],[Other|Remaining],Count) :-
    delc(Cl,Pairs,Remaining,Count).

/* sumw(+Pairs,-Sums): SimilaritySum-Class per distinct class. */
sumw([],[]).
sumw([Sim-Cl|Others],[Total-Cl|Rest]) :-
    delcw(Cl,Others,Remaining,Partial),
    Total is Partial+Sim,
    sumw(Remaining,Rest).

/* delcw(+Class,+Pairs,-Remaining,-Sum): removes the pairs of
   Class and adds up their similarities.                       */
delcw(_,[],[],0).
delcw(Cl,[Sim-Cl|Pairs],Remaining,Sum) :- !,
    delcw(Cl,Pairs,Remaining,Sum0),
    Sum is Sum0+Sim.
delcw(Cl,[Other|Pairs],[Other|Remaining],Sum) :-
    delcw(Cl,Pairs,Remaining,Sum).

/*-----------------------------------------------------------*/
/* Naive Bayes Classifier (discrete values)                  */
/*-----------------------------------------------------------*/
/* Call: bayes(+TestData, +Examples, -Classification).       */
/*-----------------------------------------------------------*/
bayes(X,Examples,Class) :-
    probs(X,Examples,Scores),
    max(Scores,_-Class).

/* probs(+X,+Examples,-Probs): one P-Class pair per class,
   P = class prior times the product of the conditional
   probabilities of the attribute values in X.                 */
probs(X,Examples,Probs) :-
    setof(Cl,Id^Vec^member(Id-Cl-Vec,Examples),Classes),
    findall(P-Cl,
            ( member(Cl,Classes),
              cond_prob(X,Cl,Examples,CondList),
              class_prob(Cl,Examples,Prior),
              mult(CondList,Likelihood),
              P is Likelihood*Prior ),
            Probs).

cond_prob(X,C,Examples,PL) :-
    cond_prob(1,X,C,Examples,PL).

/* cond_prob(+I,+Values,+Class,+Examples,-Probs)
   For the I-th attribute onwards: relative frequency of each
   test value among the examples of Class.                     */
cond_prob(_,[],_,_,[]) :- !.
cond_prob(I,[Val|Vals],C,Examples,[P|Ps]) :-
    findall(V,(member(_-C-Vec,Examples),nth1(I,Vec,V)),Column),
    length(Column,Total),
    findall(Val,member(Val,Column),Matches),
    length(Matches,Freq),
    P is Freq/Total,
    Next is I+1,
    cond_prob(Next,Vals,C,Examples,Ps).
/* class_prob(+Class,+Examples,-P): share of the examples
   (Id-Class-Vector) that belong to Class.                     */
class_prob(C,Examples,PC) :-
    findall(Id,member(Id-_,Examples),Everyone),
    length(Everyone,Total),
    findall(Id,member(Id-C-_,Examples),InClass),
    length(InClass,Hits),
    PC is Hits/Total.

/* mult(+Numbers,-Product); the empty product is 1. */
mult([],1) :- !.
mult([P|Rest],Product) :-
    mult(Rest,Partial),
    Product is Partial*P.

/*---------------------------------------------------------*/
/* BELIEF NETWORK INTERPRETER                              */
/* Computational Intelligence: a logical approach.         */
/* Copyright (c) 1998, Poole, Mackworth, Goebel            */
/* and Oxford University Press.                            */
/*---------------------------------------------------------*/
/* A belief network is represented with the relations
     variables(Xs)  Xs is the list of random variables.
                    Xs is ordered: parents of a node come
                    before the node.
     parents(X,Ps)  Ps is the list of parents of variable X.
                    Ps is ordered consistently with Xs.
     values(X,Vs)   Vs is the list of values of X.
     pr(X,As,D)     X is a variable, As is a list of Pi=Vi
                    where Pi is a parent of X and Vi is a
                    value for variable Pi. The elements of
                    As are ordered consistently with Ps.   */
/*---------------------------------------------------------*/

/*---------------------------------------------------------*/
/* p(Var,Obs,Dist) is true if Dist represents the
   probability distribution of P(Var|Obs), where Obs is a
   list of Vari=Vali. Var is not observed.                 */
/*---------------------------------------------------------*/
p(Var,Obs,VDist) :-
    relevant(Var,Obs,Relevant),
    to_sum_out(Relevant,Var,Obs,Eliminate),
    joint(Relevant,Obs,Factors),
    sum_out_each(Eliminate,Factors,Reduced),
    collectt(Reduced,Unnormalized),
    normalize(Unnormalized,0,_,VDist).

/*---------------------------------------------------------*/
/* relevant(Var,Obs,RelVars)
   RelVars is the list of relevant variables given query
   Var and observations Obs. This is the most conservative
   choice: every variable.                                 */
/*---------------------------------------------------------*/
relevant(_,_,All) :-
    variables(All).

/*---------------------------------------------------------*/
/* to_sum_out(Vs,Var,Obs,SO)
   Given all variables Vs, query variable Var and
   observations Obs, SO specifies the elimination ordering.
   Here, naively, the elimination ordering is the same as
   the variable ordering.                                  */
/*---------------------------------------------------------*/
to_sum_out(Vs,Var,Obs,SO) :-
    remove(Var,Vs,WithoutQuery),
    remove_each_obs(Obs,WithoutQuery,SO).

/*---------------------------------------------------------*/
/* remove_each_obs(Obs,RVs,SO)
   removes each of the observation variables from RVs,
   resulting in SO.                                        */
/*---------------------------------------------------------*/
remove_each_obs([],SO,SO) :- !.
remove_each_obs([Var=_|Rest],Vs0,SO) :-
    remove_if_present(Var,Vs0,Vs1),
    remove_each_obs(Rest,Vs1,SO).

/*---------------------------------------------------------*/
/* A joint probability distribution is represented as a
   list of distribution trees of the form
       dtree(Vars,DTree)
   where Vars is a list of variables (ordered consistently
   with the ordering of variables), and DTree is a tree
   representation of the function from values of the
   variables into numbers: if Vars=[] then DTree is a
   number. Otherwise Vars=[Var|RVars], and DTree is a list
   with one element for each value of Var, each element
   being a tree representation for RVars. The ordering of
   the elements in DTree is given by the ordering of Vals
   in values(Var,Vals).                                    */
/*---------------------------------------------------------*/
/* joint(Vs,Obs,Joint)
   Vs is a list of variables, Obs is an observation list.
   Returns a list of dtrees that takes the observations
   into account. There is a dtree for each non-observed
   variable.                                               */
/*---------------------------------------------------------*/
joint([],_,[]) :- !.
joint([Var|Vars],Obs,[dtree(DVars,DTree)|Trees]) :-
    parents(Var,Parents),
    make_dvars(Parents,Var,Obs,DVars),
    DVars \== [], !,
    make_dtree(Parents,Var,Obs,[],DTree),
    joint(Vars,Obs,Trees).
joint([_|Vars],Obs,Trees) :-
    /* a dtree with no variables is dropped */
    joint(Vars,Obs,Trees).
/*---------------------------------------------------------*/
/* make_dvars(PX,X,Obs,DVars)
   X is a variable, PX the list of parents of X and Obs
   the observation list. Returns
       DVars = {X} U PX - observed variables.
   This relies on PX being ordered before X.               */
/*---------------------------------------------------------*/
make_dvars([],X,Obs,[]) :-
    member(X=_,Obs), !.
make_dvars([],X,_,[X]).
make_dvars([Par|Pars],X,Obs,DVars) :-
    member(Par=_,Obs), !,
    make_dvars(Pars,X,Obs,DVars).
make_dvars([Par|Pars],X,Obs,[Par|DVars]) :-
    /* reached only when Par is not observed */
    make_dvars(Pars,X,Obs,DVars).

/*---------------------------------------------------------*/
/* make_dtree(RP,X,Obs,Con,Dtree)
   constructs a factor corresponding to p(X|PX).
   RP is the list of remaining parents of X, Obs is the
   observation list, Con is a context of assignments to
   the previous (in the variable ordering) parents of X,
   kept in reverse order. Returns Dtree as the dtree
   corresponding to the values of RP.                      */
/*---------------------------------------------------------*/
make_dtree([],X,Obs,Con,Tree) :-
    member(X=Observed,Obs), !,
    reverse(Con,Context),
    pr(X,Context,Dist),
    values(X,Vals),
    select_corresp_elt(Vals,Observed,Dist,Tree).
make_dtree([],X,_,Con,Tree) :-
    reverse(Con,Context),
    pr(X,Context,Tree).
make_dtree([Par|Pars],X,Obs,Con,Tree) :-
    member(Par=Val,Obs), !,
    make_dtree(Pars,X,Obs,[Par=Val|Con],Tree).
make_dtree([Par|Pars],X,Obs,Con,Tree) :-
    values(Par,Vals),
    make_dtree_for_vals(Vals,Par,Pars,X,Obs,Con,Tree).

/*---------------------------------------------------------*/
/* make_dtree_for_vals(Vals,P,RP,X,Obs,Con,DX)
   makes a dtree for each value in Vals and collects them
   into DX. Other arguments are as for make_dtree.         */
/*---------------------------------------------------------*/
make_dtree_for_vals([],_,_,_,_,_,[]) :- !.
make_dtree_for_vals([Val|Vals],Par,Pars,X,Obs,Con,[Sub|Subs]) :-
    make_dtree(Pars,X,Obs,[Par=Val|Con],Sub),
    make_dtree_for_vals(Vals,Par,Pars,X,Obs,Con,Subs).
/*---------------------------------------------------------*/
/* select_corresp_elt(Vals,Val,List,Elt)
   is true if Elt is at the same position in List as Val
   is in the list Vals. Assumes Vals, Val and List are
   bound.                                                  */
/*---------------------------------------------------------*/
select_corresp_elt([Val|_],Val,[Elt|_],Elt) :- !.
select_corresp_elt([_|Vals],Val,[_|Elts],Elt) :-
    select_corresp_elt(Vals,Val,Elts,Elt).

/*---------------------------------------------------------*/
/* sum_out_each(SO,Joint0,Joint1)
   is true if Joint1 is the distribution Joint0 with each
   variable in SO summed out.                              */
/*---------------------------------------------------------*/
sum_out_each([],Joint,Joint) :- !.
sum_out_each([Var|Vars],Joint0,Joint) :-
    sum_out(Var,Joint0,Joint1),
    sum_out_each(Vars,Joint1,Joint).

/*---------------------------------------------------------*/
/* sum_out(V,J0,J1)
   is true if J1 is the distribution J0 with variable V
   summed out.                                             */
/*---------------------------------------------------------*/
sum_out(Var,Joint0,[dtree(Vars,Tree)|Without]) :-
    partition(Joint0,Var,Without,With),
    variables(AllVars),
    find_tree_vars(With,AllVars,TreeVars),
    remove(Var,TreeVars,Vars),
    Vars \== [], !,
    create_tree(Vars,Vars,With,Var,[],Tree).
sum_out(Var,Joint0,Without) :-
    /* a resulting dtree with no variables is dropped */
    partition(Joint0,Var,Without,_).

/*---------------------------------------------------------*/
/* partition(J0,X,NoX,SomeX)
   partitions J0 into those dtrees that contain variable X
   (SomeX) and those that do not contain X (NoX).          */
/*---------------------------------------------------------*/
partition([],_,[],[]) :- !.
partition([dtree(Vars,Tree)|Trees],Var,Without,[dtree(Vars,Tree)|With]) :-
    member(Var,Vars), !,
    partition(Trees,Var,Without,With).
partition([dtree(Vars,Tree)|Trees],Var,[dtree(Vars,Tree)|Without],With) :-
    partition(Trees,Var,Without,With).
/*---------------------------------------------------------*/
/* find_tree_vars(SomeX,AllVars,CVars)
   is true if CVars is the set of variables that appear in
   some dtree in SomeX, ordered according to AllVars.      */
/*---------------------------------------------------------*/
find_tree_vars([],_,[]) :- !.
find_tree_vars([dtree(Vars,_)|Trees],AllVars,Union) :-
    find_tree_vars(Trees,AllVars,RestVars),
    ordered_union(Vars,RestVars,Union,AllVars).

/*---------------------------------------------------------*/
/* create_tree(CVars,Vars,SomeX,X,Context,CTree)
   CTree is the tree corresponding to variables CVars.
   The values of the leaves of the tree are obtained by
   multiplying the corresponding values in SomeX and
   summing over the values of X.                           */
/*---------------------------------------------------------*/
create_tree([],Vars,Factors,X,Context,Leaf) :-
    reverse(Context,Assigned),
    values(X,XVals),
    sum_vals(XVals,X,Vars,Assigned,Factors,0,Leaf), !.   /* ??? */
create_tree([Var|CVars],Vars,Factors,X,Context,Tree) :-
    values(Var,Vals),
    create_tree_vals(Vals,CVars,Vars,Factors,X,Context,Tree).

/*---------------------------------------------------------*/
/* create_tree_vals(Vals,CVars,Vars,SomeX,X,Context,CTree)
   creates a tree for each value in Vals.                  */
/*---------------------------------------------------------*/
create_tree_vals([],_,_,_,_,_,[]) :- !.
create_tree_vals([Val|Vals],CVars,Vars,Factors,X,Context,[Sub|Subs]) :-
    create_tree(CVars,Vars,Factors,X,[Val|Context],Sub),
    create_tree_vals(Vals,CVars,Vars,Factors,X,Context,Subs).

/*---------------------------------------------------------*/
/* sum_vals(Vals,X,Vars,CVals,SomeX,Acc,Sum)
   sums out X in the context Vars=CVals.
   Vals is the remaining set of values to be added,
   SomeX is the list of factors to be multiplied.          */
/*---------------------------------------------------------*/
sum_vals([],_,_,_,_,Sum,Sum) :- !.
sum_vals([Val|Vals],X,Vars,CVals,Factors,Acc0,Sum) :-
    mult_vals(Factors,Val,X,Vars,CVals,1,Product),
    Acc is Acc0+Product,
    sum_vals(Vals,X,Vars,CVals,Factors,Acc,Sum).
/*---------------------------------------------------------*/
/* mult_vals(SomeX,Val,X,Vars,CVals,Acc,Prod)
   computes the product of the SomeX factors given X=Val
   and Vars=CVals.                                         */
/*---------------------------------------------------------*/
mult_vals([],_,_,_,_,Prod,Prod) :- !.
mult_vals([Factor|Factors],Val,X,Vars,CVals,Acc0,Prod) :-
    lookup(X,Val,Vars,CVals,Factor,Entry),
    Acc is Acc0*Entry,
    mult_vals(Factors,Val,X,Vars,CVals,Acc,Prod).

/*---------------------------------------------------------*/
/* lookup(Var0,Val0,Vars,Vals,dtree(DVars,DTree),Prob)
   DVars is a subset of Vars U {Var0}. Returns the value
   Prob by looking up "Var0=Val0 & Vars=Vals" in DTree.
   It assumes that the elements of Vars and DVars are
   ordered consistently.                                   */
/*---------------------------------------------------------*/
lookup(_,_,[],[],dtree([],Prob),Prob).
lookup(Var0,Val0,[Var|Vars],[Val|Vals],dtree([Var|TreeVars],Tree),Prob) :- !,
    /* next context variable is also the tree's next variable */
    values(Var,Domain),
    select_corresp_elt(Domain,Val,Tree,Sub),
    lookup(Var0,Val0,Vars,Vals,dtree(TreeVars,Sub),Prob).
lookup(Var0,Val0,Vars,Vals,dtree([Var0|TreeVars],Tree),Prob) :- !,
    /* the tree's next variable is the one being summed out */
    values(Var0,Domain),
    select_corresp_elt(Domain,Val0,Tree,Sub),
    lookup(Var0,Val0,Vars,Vals,dtree(TreeVars,Sub),Prob).
lookup(Var0,Val0,[_|Vars],[_|Vals],DT,Prob) :-
    /* context variable does not occur in this tree: skip it */
    lookup(Var0,Val0,Vars,Vals,DT,Prob).

/*---------------------------------------------------------*/
/* collectt(Dist,DT)
   multiplies all of the factors together, forming a
   dtree. This assumes that all of the factors contain
   just the query variable.                                */
/*---------------------------------------------------------*/
collectt([dtree(_,DT)],DT) :- !.
collectt([dtree(_,First)|Others],DT) :-
    collectt(Others,Rest),
    multiply_corresp_elts(First,Rest,DT).

/*---------------------------------------------------------*/
/* multiply_corresp_elts(DT0,DT1,DT2)
   DT2 is the element-wise product of DT0 and DT1.         */
/*---------------------------------------------------------*/
multiply_corresp_elts([],[],[]).
/* multiply_corresp_elts/3, recursive clause: multiplies the
   heads and continues with the tails.                       */
multiply_corresp_elts([A|As],[B|Bs],[P|Ps]) :-
    P is A*B,
    multiply_corresp_elts(As,Bs,Ps).

/*---------------------------------------------------------*/
/* normalize(List,CumVal,Sum,NList)
   makes NList the same as List, but with elements that
   sum to 1. Sum is the sum of the whole list and CumVal
   is the sum accumulated up to this point.                */
/*---------------------------------------------------------*/
normalize([],Sum,Sum,[]).
normalize([A|As],Acc0,Sum,[N|Ns]) :-
    Acc is Acc0 + A,
    normalize(As,Acc,Sum,Ns),
    /* Sum is only known once the end of the list was reached */
    N is A/Sum.

/*---------------------------------------------------------*/
/* ordered_union(L0,L1,R,RL)
   is true if R = L0 U L1, where RL is a reference list
   that provides the ordering of elements.
   L0, L1 and RL must all be bound.                        */
/*---------------------------------------------------------*/
ordered_union([],L,L,_) :- !.
ordered_union(L,[],L,_) :- !.
ordered_union([E|As],[E|Bs],[E|Us],[E|Ref]) :- !,
    ordered_union(As,Bs,Us,Ref).
ordered_union([E|As],Bs,[E|Us],[E|Ref]) :- !,
    ordered_union(As,Bs,Us,Ref).
ordered_union(As,[E|Bs],[E|Us],[E|Ref]) :- !,
    ordered_union(As,Bs,Us,Ref).
ordered_union(As,Bs,Us,[_|Ref]) :- !,
    ordered_union(As,Bs,Us,Ref).

/*---------------------------------------------------------*/
/* STANDARD DEFINITIONS                                    */
/*---------------------------------------------------------*/
/* reverse(L,R) is true if R contains the same elements as
   list L, in reverse order.                               */
/*---------------------------------------------------------*/
reverse(List,Reversed) :-
    rev(List,[],Reversed).

rev([],Acc,Acc).
rev([X|Xs],Acc,Reversed) :-
    rev(Xs,[X|Acc],Reversed).

/*---------------------------------------------------------*/
/* remove(E,L,R) is true if R is the list L with one
   occurrence of E removed.                                */
/*---------------------------------------------------------*/
remove(E,[E|Rest],Rest).
remove(E,[Other|Rest],[Other|Kept]) :-
    remove(E,Rest,Kept).
/*---------------------------------------------------------*/
/* remove_if_present(E,L,R) true if R is the list L with
   the first occurrence of E removed; R = L when E does
   not occur in L.                                         */
/*---------------------------------------------------------*/
remove_if_present(_,[],[]).
remove_if_present(E,[E|L],L) :- !.
remove_if_present(E,[A|L],[A|R]) :- remove_if_present(E,L,R).

/*---------------------------------------------------------*/
/* Induction of Decision Trees                             */
/*---------------------------------------------------------*/

/*---------------------------------------------------------*/
/* Example format: example(ID, Class, [A=V,...]).          */
/*---------------------------------------------------------*/
/* Converting document vectors into the example format:    */
/*   id3format(Vectors,IDF) - converts and stores          */
/*                            examples in the Prolog database */
/* Use:                                                    */
/*   Create a decision tree: ?- id3.                       */
/*   Print tree: ?- showtree.                              */
/*   Print rules: ?- listing(if).                          */
/* NOTE(review): the only id3 clause visible here is       */
/* id3/1 (id3(Tr)) - confirm the intended call.            */
/*---------------------------------------------------------*/

/*---------------------------------------------------------*/
/* Operators used to store the induced rules (if/then).    */
?- op(100,fx,if).
?- op(99,xfy,then).
/*---------------------------------------------------------*/

/* id3format(+Vectors,+IDF): replaces all stored example/3
   facts by the converted Vectors.                           */
id3format(Vectors,IDF) :- retractall(example(_,_,_)), convert(Vectors,IDF).

/* convert(+Vectors,+IDF): asserts one example(ID,Class,AVs)
   per ID-Class-Values vector.                               */
convert([],_) :- !.
convert([ID-Class-E|L],IDF) :- add_terms(IDF,E,TE), assertz(example(ID,Class,TE)), convert(L,IDF).

/* add_terms(+IDF,+Values,-AVs): pairs the term T of every
   _-T element of IDF with the value at the same position.   */
add_terms([],[],[]) :- !.
add_terms([_-T|L1],[V|L2],[T=V|L3]) :- add_terms(L1,L2,L3).

/* id3(Tr): clears the stored node/3 facts and if-then rules,
   collects the ids of all examples, takes the attribute list
   from the first example and builds the tree with idt/4.
   get_attributes/2 and assert_rules/0 are not visible in
   this part of the file.                                    */
id3(Tr) :- retractall(node(_,_,_)), retractall(if _ then _), findall(N,example(N,_,_),E), example(_,_,L), !, get_attributes(L,A), idt(E,root,A,Tr), assert_rules, !.

/* NOTE(review): the next clause is damaged in this copy of
   the file - the text between "Len=" and the trailing
   write(Class) goal is missing, along with whatever
   definitions it contained. It is kept exactly as found and
   must be restored from an intact copy.                     */
idt(E,Parent,_,Tr) :- length(E,Len), Len= '),write(Class).

/* showtree(+Parent,+Pos): prints the sons of Parent (taken
   from node/3) indented two columns further than Pos.       */
showtree(Parent,Pos) :- findall(Son,node(Son,_,Parent),L), Pos1 is Pos+2, show_list(L,Pos1).

/* show_list(+Nodes,+Pos): prints the label of each node on a
   new line at column Pos, followed by its subtree.          */
show_list([],_) :- !.
show_list([N|T],Pos) :- node(N,Label,_), nl, tab(Pos), write(Label), showtree(N,Pos), show_list(T,Pos).
/*-----------------------------------------------------------*/
/* Utilities                                                 */
/*-----------------------------------------------------------*/

/* corpus(+Files,-Words): concatenated token lists of all
   files. Each input stream is closed (seen/0) even if the
   tokenizer fails or raises an error.                         */
corpus([],[]) :- !.
corpus([F|T],W) :-
    setup_call_cleanup(see(F), once(tokenize(L)), seen),
    corpus(T,V),
    append(L,V,W).

/* firstn(+N,+List,-Prefix): at most the first N elements. */
firstn(0,_,[]) :- !.
firstn(_,[],[]) :- !.
firstn(N,[X|T],[X|V]) :-
    M is N-1,
    firstn(M,T,V).

/* firstn1(+N,+Pairs,-Values): as firstn/3, but keeps only the
   value of each Key-Value pair.                               */
firstn1(0,_,[]) :- !.
firstn1(_,[],[]) :- !.
firstn1(N,[_-X|T],[X|V]) :-
    M is N-1,
    firstn1(M,T,V).

/* wfreq(+Words,-Freqs): Count-Word pairs, highest count first. */
wfreq(L,RSF) :-
    wf(L,F),
    keysort(F,SF),
    reverse(SF,RSF).

wf([],[]) :- !.
wf([X|T],[C-X|L]) :-
    count([X|T],X,R,C), !,
    wf(R,L).

/* update_idf(+Terms0,+Words,-Terms): raises the document count
   of every term found in Words. A term is either Count-Word
   or a bare Word that has not been seen yet.                  */
update_idf([],_,[]) :- !.
update_idf([N-W|T],WL,[M-W|V]) :-
    memberchk(W,WL), !,
    M is N+1,
    update_idf(T,WL,V).
update_idf([W|T],WL,[1-W|V]) :-
    memberchk(W,WL), !,
    update_idf(T,WL,V).
update_idf([W|T],WL,[W|V]) :-
    update_idf(T,WL,V).

/* calc_idf(+Counts,+D,-IDFs): IDF = log((1+D)/N) for every
   N-Word pair, D being the number of documents.               */
calc_idf([],_,[]) :- !.
calc_idf([N-W|T],D,[IDF-W|V]) :-
    IDF is log((1+D)/N),
    calc_idf(T,D,V).

/* binvalues(+IDF,+Words,-Bits): 1 if the term occurs in
   Words, 0 otherwise.                                         */
binvalues([],_,[]) :- !.
binvalues([_-W|T],WL,[1|L]) :-
    memberchk(W,WL), !,
    binvalues(T,WL,L).
binvalues([_|T],WL,[0|L]) :-
    binvalues(T,WL,L).

/* termvalues(+IDF,+Words,-Values): TF*IDF weight of every
   IDF-Word term, TF being the relative frequency of the term
   in Words. An empty token list yields 0.0 weights instead of
   a division by zero.                                         */
termvalues(Terms,WL,Values) :-
    length(WL,M),                   % computed once, not per term
    termvalues_(Terms,WL,M,Values).

termvalues_([],_,_,[]) :- !.
termvalues_([IDF-W|T],WL,M,[V|L]) :-
    count(WL,W,_,N), !,
    (   M =:= 0
    ->  V = 0.0
    ;   V is (N/M)*IDF              % Not Cornell SMART
    ),
    termvalues_(T,WL,M,L).

/* norm(+Vector,-N): N is the SUM OF SQUARES of the components
   (the squared Euclidean length).                             */
norm([],0) :- !.
norm([V|T],N) :-
    norm(T,N1),
    N is V*V+N1.

/* normalize(+Vector,+SquaredNorm,-Unit): divides every
   component by sqrt(SquaredNorm). A zero norm, integer or
   float, leaves the vector unchanged. Dividing by the root
   (rather than taking the root of V*V/N) keeps the sign of
   negative components.                                        */
normalize(V,N,V) :- N =:= 0, !.
normalize([],_,[]) :- !.
normalize([V|T],N,[NV|L]) :-
    NV is V/sqrt(N),
    normalize(T,N,L).

/* sum(+Vectors,-Sum): component-wise sum of a non-empty list
   of equally long vectors.                                    */
sum([[X|T]],[X|T]) :- !.
sum([[X|T]|L],S) :-
    sum(L,S1),
    sum2([X|T],S1,S).

sum2([],[],[]) :- !.
sum2([X|T],[Y|U],[Z|V]) :-
    Z is X+Y,
    sum2(T,U,V).

divide([],_,[]) :- !.
divide([V|T],D,[DV|L]) :-
    DV is V/D,
    divide(T,D,L).

/* max(+Pairs,-Best): Key-Value pair with the largest key. */
max([X],X) :- !.
max([M-X|T],N-Y) :-
    max(T,K-Z),
    ( M>K, N-Y=M-X ; N-Y=K-Z ), !.

/* maximum(+Items,-Best): Item/Score with the largest score. */
maximum([X],X) :- !.
maximum([X/M|T],Y/N) :-
    maximum(T,Z/K),
    ( M>K, Y/N=X/M ; Y/N=Z/K ), !.

/* pair(+List,-A,-B): enumerates all pairs with A before B. */
pair([A|L],A,B) :- member(B,L).
pair([_|L],A,B) :- pair(L,A,B).

del(X,[X|T],T).
del(X,[Y|T],[Y|L]) :- del(X,T,L).

subset([],_) :- !.
subset([X|T],L) :-
    member(X,L), !,
    subset(T,L).

intersection([],_,[]) :- !.
/* intersection/3, recursive clauses: keeps the elements of
   the first list that are members of the second.              */
intersection([X|T],L,[X|V]) :-
    member(X,L), !,
    intersection(T,L,V).
intersection([_|T],L,V) :-
    intersection(T,L,V).

/* count(+List,+X,-Rest,-N): N is the number of elements equal
   to X; Rest is List without them.                            */
count([],_,[],0) :- !.
count([X|T],X,R,N) :- !,
    count(T,X,R,M),
    N is M+1.
count([X|T],Y,[X|R],N) :- !,
    count(T,Y,R,N).

/* ppl(+List): prints one element per line. */
ppl([]) :- !.
ppl([X|T]) :-
    writeln(X),
    ppl(T).

/* gen_name(-M): next number of a counter kept in nam/1;
   the first call yields 1.                                    */
gen_name(M) :-
    retract(nam(N)),
    M is N+1,
    assert(nam(M)), !.
gen_name(1) :-
    assert(nam(1)).

/* efface(+X,+List,-Rest): removes the first occurrence of X;
   fails if there is none.                                     */
efface(X,[X|T],T) :- !.
efface(X,[Y|T],[Y|Z]) :-
    efface(X,T,Z).

/*-------------- Reading and tokenizing text ----------------*/

/* tokenize(-Words): reads the current input up to end of file
   and returns its words, lower-cased, without stopwords.
   Only character-level I/O is used (peek_code/get_code):
   byte-level peeking on a text stream is rejected by
   SWI-Prolog unless the stream is ISO Latin-1 encoded.        */
tokenize([]) :-
    peek_code(-1), !.
tokenize(T) :-
    peek_code(C),
    \+ charclass(C,_), !,       % skip a separator character
    get_code(_),
    tokenize(T).
tokenize(R) :-
    getword(L),
    name(W,L),
    ( stopword(W), R=T ; R=[W|T] ), !,
    tokenize(T).

/* getword(-Codes): reads accepted characters up to and
   including the first character that is not accepted.         */
getword([X|T]) :-
    get_code(C),
    charclass(C,X), !,
    getword(T).
getword([]).

/*-----------------------------------------------------------*/
/* Character classes accepted by tokenize                    */
/*-----------------------------------------------------------*/
charclass(C,C) :- C>47, C<58 , !.               % digit
charclass(C,C) :- C>96, C<123 , !.              % lower case
charclass(C,U) :- C>64, C<91 , U is C+32, !.    % upper case

/*-----------------------------------------------------------*/
/* Stopwords (words ignored by tokenize)                     */
/*-----------------------------------------------------------*/
stopword(W) :- name(W,[_]), !.  % every one-character token
stopword(a).         stopword(about).     stopword(above).
stopword(across).    stopword(after).     stopword(afterwards).
stopword(again).     stopword(against).   stopword(all).
stopword(almost).    stopword(alone).     stopword(along).
stopword(already).   stopword(also).      stopword(although).
stopword(always).    stopword(am).        stopword(among).
stopword(amongst).   stopword(amoungst).  stopword(amount).
stopword(an).        stopword(and).       stopword(another).
stopword(any).       stopword(anyhow).    stopword(anyone).
stopword(anything).  stopword(anyway).    stopword(anywhere).
stopword(are).       stopword(around).    stopword(as).
stopword(at).        stopword(back).      stopword(be).
stopword(became).    stopword(because).   stopword(become).
stopword(becomes).   stopword(becoming).
/* Stopwords, continued (been ... moreover). The misspelled
   entry "fify" is kept for compatibility; "fifty" is added so
   that the word itself is filtered like forty and sixty.      */
stopword(been).      stopword(before).    stopword(beforehand).
stopword(behind).    stopword(being).     stopword(below).
stopword(beside).    stopword(besides).   stopword(between).
stopword(beyond).    stopword(bill).      stopword(both).
stopword(bottom).    stopword(but).       stopword(by).
stopword(call).      stopword(can).       stopword(cannot).
stopword(cant).      stopword(co).        stopword(con).
stopword(could).     stopword(couldnt).   stopword(cry).
stopword(de).        stopword(describe).  stopword(detail).
stopword(do).        stopword(done).      stopword(down).
stopword(due).       stopword(during).    stopword(each).
stopword(eg).        stopword(eight).     stopword(either).
stopword(eleven).    stopword(else).      stopword(elsewhere).
stopword(empty).     stopword(enough).    stopword(etc).
stopword(even).      stopword(ever).      stopword(every).
stopword(everyone).  stopword(everything). stopword(everywhere).
stopword(except).    stopword(few).       stopword(fifteen).
stopword(fify).      stopword(fifty).     stopword(fill).
stopword(find).      stopword(fire).      stopword(first).
stopword(five).      stopword(for).       stopword(former).
stopword(formerly).  stopword(forty).     stopword(found).
stopword(four).      stopword(from).      stopword(front).
stopword(full).      stopword(further).   stopword(get).
stopword(give).      stopword(go).        stopword(had).
stopword(has).       stopword(hasnt).     stopword(have).
stopword(he).        stopword(hence).     stopword(her).
stopword(here).      stopword(hereafter). stopword(hereby).
stopword(herein).    stopword(hereupon).  stopword(hers).
stopword(herself).   stopword(him).       stopword(himself).
stopword(his).       stopword(how).       stopword(however).
stopword(hundred).   stopword(i).         stopword(ie).
stopword(if).        stopword(in).        stopword(inc).
stopword(indeed).    stopword(interest).  stopword(into).
stopword(is).        stopword(it).        stopword(its).
stopword(itself).    stopword(keep).      stopword(last).
stopword(latter).    stopword(latterly).  stopword(least).
stopword(less).      stopword(ltd).       stopword(made).
stopword(many).      stopword(may).       stopword(me).
stopword(meanwhile). stopword(might).     stopword(mill).
stopword(mine).      stopword(more).      stopword(moreover).
/* Stopwords, continued (most ... up). */
stopword(most).      stopword(mostly).    stopword(move).
stopword(much).      stopword(must).      stopword(my).
stopword(myself).    stopword(name).      stopword(namely).
stopword(nearby).    stopword(neither).   stopword(never).
stopword(nevertheless). stopword(next).   stopword(nine).
stopword(no).        stopword(nobody).    stopword(none).
stopword(noone).     stopword(nor).       stopword(not).
stopword(nothing).   stopword(now).       stopword(nowhere).
stopword(of).        stopword(off).       stopword(often).
stopword(on).        stopword(once).      stopword(one).
stopword(only).      stopword(onto).      stopword(or).
stopword(other).     stopword(others).    stopword(otherwise).
stopword(our).       stopword(ours).      stopword(ourselves).
stopword(out).       stopword(over).      stopword(own).
stopword(part).      stopword(per).       stopword(perhaps).
stopword(please).    stopword(put).       stopword(rather).
stopword(re).        stopword(same).      stopword(see).
stopword(seem).      stopword(seemed).    stopword(seeming).
stopword(seems).     stopword(serious).   stopword(several).
stopword(she).       stopword(should).    stopword(show).
stopword(side).      stopword(since).     stopword(sincere).
stopword(six).       stopword(sixty).     stopword(so).
stopword(some).      stopword(somehow).   stopword(someone).
stopword(something). stopword(sometime).  stopword(sometimes).
stopword(somewhere). stopword(still).     stopword(such).
stopword(take).      stopword(ten).       stopword(than).
stopword(that).      stopword(the).       stopword(their).
stopword(them).      stopword(themselves). stopword(then).
stopword(thence).    stopword(there).     stopword(thereafter).
stopword(thereby).   stopword(therefore). stopword(therein).
stopword(thereupon). stopword(these).     stopword(they).
stopword(thick).     stopword(thin).      stopword(third).
stopword(this).      stopword(those).     stopword(though).
stopword(three).     stopword(through).   stopword(throughout).
stopword(thru).      stopword(thus).      stopword(to).
stopword(together).  stopword(too).       stopword(top).
stopword(toward).    stopword(towards).   stopword(twelve).
stopword(twenty).    stopword(two).       stopword(un).
stopword(under).     stopword(until).     stopword(up).
/* Stopwords, continued (upon ... yourselves). */
stopword(upon).      stopword(us).        stopword(very).
stopword(via).       stopword(was).       stopword(we).
stopword(well).      stopword(were).      stopword(what).
stopword(whatever).  stopword(when).      stopword(whence).
stopword(whenever).  stopword(where).     stopword(whereafter).
stopword(whereas).   stopword(whereby).   stopword(wherein).
stopword(whereupon). stopword(wherever).  stopword(whether).
stopword(which).     stopword(while).     stopword(whither).
stopword(who).       stopword(whoever).   stopword(whole).
stopword(whom).      stopword(whose).     stopword(why).
stopword(will).      stopword(with).      stopword(within).
stopword(without).   stopword(would).     stopword(yet).
stopword(you).       stopword(your).      stopword(yours).
stopword(yourself).  stopword(yourselves).