#encoding:latin-1
"""Two-topic document clustering demo using non-negative matrix approximation.

The script loads a pickled document matrix from ``documents.dat``, applies the
NC weighting (see ``NCW``), factorises the weighted matrix with ``GDCLS_L1``
using two components and scatter-plots the two rows of the returned
coefficient matrix against each other.

NOTE: this script targets Python 2 (``cPickle``).  The ``print`` calls use the
single-argument parenthesised form, which prints exactly the same text under
Python 2 as the statement form.
"""
from py_nnma import *
import numpy as np
from pylab import *
import math
import pprint
import time
import cPickle

#
# A contains documents belonging to one of two topics.
# The nnma of A is used to find the membership of each document.
#
# NOTE(review): unpickling can execute arbitrary code embedded in the file,
# so documents.dat must come from a trusted source.
with open("documents.dat", "rb") as fh:
    # The context manager closes the file handle once the matrix is loaded.
    A = cPickle.load(fh)

# Rows are terms, columns are documents (see the message printed below).
m, n = A.shape
print("loaded document matrix with %d documents and %d terms" % (n, m))
print("")


def NCW(X):
    """Return the NC weighted form of the document matrix ``X``.

    Computes ``X * D`` with ``D = diag((X^T X e)^(-1/2))``, where ``e`` is
    the all-ones vector with one entry per column of ``X``.

    From: "Document clustering based on non-negative matrix factorization"
    Wei Xu, Xin Liu, Yihong Gong
    Proceedings of the 26th annual international ACM SIGIR conference on
    Research and development in information retrieval
    2003, pages 267 - 273
    http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.117.2293

    Parameters:
        X: two-dimensional document matrix.  The ``*`` operator is used as a
           matrix product here, so X is presumably a scipy.sparse matrix
           (or similar) rather than a plain ndarray -- TODO confirm.

    Returns:
        The product ``X * D``, i.e. ``X`` with every column rescaled.
    """
    _, n = X.shape
    XXTe = X.transpose() * X * np.ones(n,)
    # Entries that are exactly zero would turn into inf under the negative
    # power below; use a neutral weight of 1 for them instead.
    XXTe[XXTe == 0] = 1.0
    D = np.diag(XXTe ** (-.5))
    # Any remaining non-finite weight is replaced by 1 as well.
    D[np.isinf(D)] = 1.0
    D[np.isnan(D)] = 1.0
    return X * D


def scatt(c1, c2, t="", ndocs_topic_one=114):
    """Scatter plot of the vectors ``c1`` and ``c2`` in a new figure.

    Both vectors are scaled to unit Euclidean norm (unless they are all
    zero) before plotting.  Points are coloured according to topic
    membership: the first ``ndocs_topic_one`` documents are drawn red
    ("topic 1"), the remaining ones green ("topic 2").  Additionally a dotted
    decision line is drawn from the origin to ``(max(c1), max(c2))``.

    The inputs are never modified; normalisation works on copies.

    Parameters:
        c1: one-dimensional sequence with the x value of every document.
        c2: one-dimensional sequence with the y value of every document.
        t: title of the figure.
        ndocs_topic_one: number of leading documents that belong to
            "topic 1".  Defaults to 114, the split of the bundled data set.
    """
    c1 = np.asarray(c1, dtype=float)
    c2 = np.asarray(c2, dtype=float)

    # Normalise into new arrays: an in-place division would write through to
    # the caller's data when c1/c2 are views (e.g. rows of a matrix).
    norm1 = np.linalg.norm(c1)
    if norm1:
        c1 = c1 / norm1
    norm2 = np.linalg.norm(c2)
    if norm2:
        c2 = c2 / norm2

    figure()
    title(t)

    # Only index positions present in both vectors are plotted.
    npts = min(len(c1), len(c2))
    xs, ys = c1[:npts], c2[:npts]
    # One call per topic; red first, then green, so the draw order is the
    # same as plotting the documents one by one.
    plot(xs[:ndocs_topic_one], ys[:ndocs_topic_one], 'r.')
    plot(xs[ndocs_topic_one:], ys[ndocs_topic_one:], 'g.')

    plot([0, c1.max()], [0, c2.max()], 'k:')
    axis([-.01, .2, -.01, .2])


print("bild nc weighted document matrix")
A = NCW(A)

print(time.asctime())
B, C, obj, _, _ = GDCLS_L1(A, 2, verbose=1)
print(time.asctime())
print("")
# NOTE(review): the rows of C are presumably the per-document coefficients of
# the two components -- confirm against py_nnma.
scatt(C[0, :], C[1, :])
show()