#encoding:latin-1 from pylab import * from numpy import * from py_nnma import * import sys, zipfile, time #### analyse scanned zipcode digits ########################################## # # source: # # http://www-stat-class.stanford.edu/~tibs/ElemStatLearn/data.html # # Normalized handwritten digits, automatically scanned from # envelopes by the U.S. Postal Service. The original scanned digits # are binary and of different sizes and orientations; the images # here have been deslanted and size normalized, resulting in 16 x # 16 grayscale images (Le Cun et al., 1990). def get_lines(): """ iterate over lines in zipfile """ lines = zipfile.ZipFile("zip.train.zip").read("zip.train")\ .split("\n") while len(lines): line = lines.pop() if line: yield line raise StopIteration # iterate over zipped digits file and collect data ########################### data = {} for line in get_lines(): fields = line.split() values = map(float,fields[1:]) id = int(float(fields[0])) data.setdefault(id,[]).append(values) # sample 50 images per digit ################################################# all_data =[] for digits in data.values(): random.shuffle(digits) all_data.extend(digits[:50]) print "got", len(all_data), "images" # build full matrix, one image per column #################################### full_matrix = array(all_data).T + 1.001 # scale to positive values # K1 rows, K2 columns ######################################################## K1=6 K2=5 def one_run(name, fun, pl, pd ): print print "RUN", name print time.asctime() started = time.time() A, X, res, count, converged = fun(*pl, **pd) print time.asctime() print "needed %d iterations and %d seconds" % (count, time.time()-started) print "residual is %e" % res print "count is", count print "converged is", converged print #B = sort_by_energy(B) figure() for i, col in enumerate(A.T): ax=subplot(K1,K2,i+1) imshow(col.reshape((16,16))**1, cmap=cm.gray) xticks([]) yticks([]) savefig("digits_%s.png" % name.lower()) def run_rri(): one_run("RRI", RRI, [full_matrix], dict(k = K1*K2, eps=1e-5, maxcount=999, verbose=10, psi=1e-3)) def run_fnmai(): one_run("FNMAI", FNMAI, [full_matrix], dict(k=K1*K2, tau=2, stabil=1e-6, maxcount=300, eps=1e-5, alpha = .1, verbose=10)) def run_fnmai_sparse(): one_run("FNMAI_SPARSE", FNMAI_SPARSE, [full_matrix], dict(k=K1*K2, tau=2, stabil=1e-6, regul=1e-3, maxcount=300, eps=1e-5, alpha = .1, verbose=10)) def run_snmf(): one_run("SNMF", SNMF, [full_matrix], dict(k=K1*K2, eps=1e-5, maxcount=999, verbose=100, sparse_par = 100.0)) def run_nmf(): one_run("NMF", NMF, [full_matrix], dict(k=K1*K2, eps=1e-5, maxcount=999, verbose=100)) def run_nmfkl(): one_run("NMFKL", NMFKL, [full_matrix], dict(k=K1*K2, eps=1e-5, maxcount=999, verbose=100)) def run_als(): one_run("ALS", ALS, [full_matrix], dict(k=K1*K2, eps=1e-5, maxcount=999, verbose=10, regul=.1)) def run_gdcls(): one_run("GDCLS", GDCLS, [full_matrix], dict(k=K1*K2, eps=1e-5, maxcount=999, verbose=10, regul=1e-3)) def run_gdcls_l1(): one_run("GDCLS_L1", GDCLS_L1, [full_matrix], dict(k=K1*K2, eps=1e-5, maxcount=999, verbose=10, regul=5e-2)) methods = ["NMF", "SNMF", "FNMAI", "RRI", "ALS", "GDCLS", "GDCLS_L1", "NMFKL", "FNMAI_SPARSE" ] if len(sys.argv)>1: methods = [ m.upper() for m in sys.argv[1:] ] print "I will run ", " ".join(methods) for m in methods: dict(NMF=run_nmf, SNMF=run_snmf, FNMAI=run_fnmai, RRI=run_rri, ALS=run_als, \ GDCLS = run_gdcls, GDCLS_L1 = run_gdcls_l1, NMFKL = run_nmfkl, \ FNMAI_SPARSE = run_fnmai_sparse)[m]() show()